2 // FORESTER -- software libraries and applications
\r
3 // for evolutionary biology research and applications.
\r
5 // Copyright (C) 2008-2009 Christian M. Zmasek
\r
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
\r
7 // Copyright (C) 2000-2001 Washington University School of Medicine
\r
8 // and Howard Hughes Medical Institute
\r
9 // Copyright (C) 2003-2007 Ethalinda K.S. Cannon
\r
10 // All rights reserved
\r
12 // This library is free software; you can redistribute it and/or
\r
13 // modify it under the terms of the GNU Lesser General Public
\r
14 // License as published by the Free Software Foundation; either
\r
15 // version 2.1 of the License, or (at your option) any later version.
\r
17 // This library is distributed in the hope that it will be useful,
\r
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
20 // Lesser General Public License for more details.
\r
22 // You should have received a copy of the GNU Lesser General Public
\r
23 // License along with this library; if not, write to the Free Software
\r
24 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
26 // Contact: phylosoft @ gmail . com
\r
27 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
\r
29 package org.forester.util;
\r
31 import java.util.regex.Matcher;
\r
32 import java.util.regex.Pattern;
\r
34 import org.forester.phylogeny.PhylogenyNode;
\r
35 import org.forester.phylogeny.data.Accession;
\r
36 import org.forester.phylogeny.data.Sequence;
\r
38 public final class SequenceAccessionTools {
\r
40 public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern
\r
41 .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );
\r
42 public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern
\r
43 .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
\r
44 public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
\r
45 .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );
\r
46 // gb_ADF31344_1_segmented_worms_
\r
48 // gb_EHB07727_1_rodents_
\r
49 // dbj_BAF37827_1_turtles_
\r
50 // emb_CAA73223_1_primates_
\r
51 // lcl_91970_unknown_
\r
52 // mites|ref_XP_002434188_1
\r
53 // ref_XP_002434188_1_mites___ticks_
\r
54 // ref_NP_001121530_1_frogs___toads_
\r
55 //The format for GenBank Accession numbers are:
\r
56 //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
\r
57 //Protein: 3 letters + 5 numerals
\r
58 //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
\r
59 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
\r
60 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
61 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
\r
62 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
63 private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
\r
64 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
65 private final static Pattern GI_PATTERN = Pattern
\r
66 .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );
\r
67 // RefSeq accession numbers can be distinguished from GenBank accessions
\r
68 // by their distinct prefix format of 2 characters followed by an
\r
69 // underscore character ('_'). For example, a RefSeq protein accession is NP_015325.
\r
70 private final static Pattern REFSEQ_PATTERN = Pattern
\r
71 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );
\r
73 private SequenceAccessionTools() {
\r
74 // Hiding the constructor.
\r
77 public static String extractGenbankAccessor( final PhylogenyNode node ) {
\r
79 if ( node.getNodeData().isHasSequence() ) {
\r
80 final Sequence seq = node.getNodeData().getSequence();
\r
81 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
82 v = parseGenbankAccessor( seq.getSymbol() );
\r
84 if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
85 v = parseGenbankAccessor( seq.getGeneName() );
\r
87 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
88 v = parseGenbankAccessor( seq.getName() );
\r
90 if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
\r
91 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
92 v = parseGenbankAccessor( seq.getAccession().getValue() );
\r
95 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {
\r
96 v = parseGenbankAccessor( node.getName() );
\r
101 public static String extractGInumber( final PhylogenyNode node ) {
\r
103 if ( node.getNodeData().isHasSequence() ) {
\r
104 final Sequence seq = node.getNodeData().getSequence();
\r
105 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
106 v = parseGInumber( seq.getName() );
\r
108 if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
\r
109 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
110 v = parseGInumber( seq.getAccession().getValue() );
\r
113 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {
\r
114 v = parseGInumber( node.getName() );
\r
119 public static String extractRefSeqAccessor( final PhylogenyNode node ) {
\r
121 if ( node.getNodeData().isHasSequence() ) {
\r
122 final Sequence seq = node.getNodeData().getSequence();
\r
123 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
124 v = parseRefSeqAccessor( seq.getSymbol() );
\r
126 if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
127 v = parseRefSeqAccessor( seq.getGeneName() );
\r
129 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
130 v = parseRefSeqAccessor( seq.getName() );
\r
132 if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
\r
133 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
134 v = parseRefSeqAccessor( seq.getAccession().getValue() );
\r
137 if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {
\r
138 v = parseRefSeqAccessor( node.getName() );
\r
143 public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
\r
145 if ( node.getNodeData().isHasSequence() ) {
\r
146 final Sequence seq = node.getNodeData().getSequence();
\r
147 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
148 a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getSymbol() );
\r
150 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
151 a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getName() );
\r
153 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
154 a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getGeneName() );
\r
156 if ( ForesterUtil.isEmpty( a ) && ( node.getNodeData().getSequence().getAccession() != null )
\r
157 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
158 a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getAccession().getValue() );
\r
161 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( node.getName() ) ) {
\r
162 a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node.getName() );
\r
167 public static String extractUniProtKbProteinSeqIdentifier( final String str ) {
\r
168 Matcher m = UNIPROT_KB_PATTERN_0.matcher( str );
\r
170 return m.group( 1 );
\r
172 m = UNIPROT_KB_PATTERN_1.matcher( str );
\r
174 return m.group( 1 );
\r
176 m = UNIPROT_KB_PATTERN_2.matcher( str );
\r
183 public final static boolean isProtein( final String query ) {
\r
184 final String r1 = parseRefSeqAccessor( query );
\r
185 if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {
\r
188 final String r2 = extractUniProtKbProteinSeqIdentifier( query );
\r
189 if ( !ForesterUtil.isEmpty( r2 ) ) {
\r
192 return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();
\r
195 public final static Accession parse( final PhylogenyNode n ) {
\r
196 String v = extractUniProtKbProteinSeqIdentifier( n );
\r
197 if ( !ForesterUtil.isEmpty( v ) ) {
\r
198 return new Accession( v, Accession.UNIPROT );
\r
200 v = extractGenbankAccessor( n );
\r
201 if ( !ForesterUtil.isEmpty( v ) ) {
\r
202 return new Accession( v, Accession.NCBI );
\r
204 v = extractRefSeqAccessor( n );
\r
205 if ( !ForesterUtil.isEmpty( v ) ) {
\r
206 return new Accession( v, Accession.REFSEQ );
\r
208 v = extractGInumber( n );
\r
209 if ( !ForesterUtil.isEmpty( v ) ) {
\r
210 return new Accession( v, Accession.GI );
\r
215 public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) {
\r
216 if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
\r
217 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
\r
218 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
\r
219 final String source = node.getNodeData().getSequence().getAccession().getSource().toLowerCase();
\r
220 final String value = node.getNodeData().getSequence().getAccession().getValue();
\r
221 if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source
\r
222 .equals( "sp" ) ) ) {
\r
223 return new Accession( value, Accession.UNIPROT );
\r
225 else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {
\r
226 return new Accession( value, Accession.EMBL );
\r
228 else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {
\r
229 return new Accession( value, Accession.NCBI );
\r
231 else if ( source.equals( "refseq" ) ) {
\r
232 return new Accession( value, Accession.REFSEQ );
\r
234 else if ( source.equals( "gi" ) ) {
\r
235 return new Accession( value, Accession.GI );
\r
242 * Returns null if no match.
\r
245 public final static Accession parse( final String s ) {
\r
246 if ( !ForesterUtil.isEmpty( s ) ) {
\r
247 String v = extractUniProtKbProteinSeqIdentifier( s );
\r
248 if ( !ForesterUtil.isEmpty( v ) ) {
\r
249 return new Accession( v, Accession.UNIPROT );
\r
251 v = parseGenbankAccessor( s );
\r
252 if ( !ForesterUtil.isEmpty( v ) ) {
\r
253 return new Accession( v, Accession.NCBI );
\r
255 v = parseRefSeqAccessor( s );
\r
256 if ( !ForesterUtil.isEmpty( v ) ) {
\r
257 return new Accession( v, Accession.REFSEQ );
\r
259 v = parseGInumber( s );
\r
260 if ( !ForesterUtil.isEmpty( v ) ) {
\r
261 return new Accession( v, Accession.GI );
\r
268 * Returns null if no match.
\r
271 public static String parseGenbankAccessor( final String query ) {
\r
272 Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
\r
273 if ( m.lookingAt() ) {
\r
274 return m.group( 1 );
\r
277 m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
\r
278 if ( m.lookingAt() ) {
\r
279 return m.group( 1 );
\r
282 m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
\r
283 if ( m.lookingAt() ) {
\r
284 return m.group( 1 );
\r
293 public static String parseGenbankProteinAccessor( final String query ) {
\r
294 final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
\r
295 if ( m.lookingAt() ) {
\r
296 return m.group( 1 );
\r
303 public static String parseGInumber( final String query ) {
\r
304 final Matcher m = GI_PATTERN.matcher( query );
\r
306 return m.group( 1 );
\r
312 * Returns null if no match.
\r
315 public final static String parseRefSeqAccessor( final String query ) {
\r
316 final Matcher m = REFSEQ_PATTERN.matcher( query );
\r
317 if ( m.lookingAt() ) {
\r
318 return m.group( 1 );
\r