2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.archaeopteryx.tools;
28 import java.io.IOException;
30 import java.net.URISyntaxException;
31 import java.util.Arrays;
32 import java.util.Enumeration;
33 import java.util.Hashtable;
34 import java.util.Vector;
36 import javax.swing.JApplet;
38 import org.forester.archaeopteryx.AptxUtil;
39 import org.forester.archaeopteryx.TreePanel;
40 import org.forester.phylogeny.PhylogenyNode;
41 import org.forester.phylogeny.data.Accession;
42 import org.forester.util.ForesterUtil;
43 import org.forester.util.SequenceAccessionTools;
44 import org.forester.ws.wabi.RestUtil;
46 public final class Blast {
48 final public static void openNcbiBlastWeb( final String query,
49 final boolean is_nucleic_acids,
52 //http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE=Proteins&DATABASE=swissprot&QUERY=gi|163848401
53 final StringBuilder uri_str = new StringBuilder();
54 uri_str.append( "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&DATABASE=nr&PAGE=" );
55 if ( is_nucleic_acids ) {
56 uri_str.append( "Nucleotide" );
59 uri_str.append( "Proteins" );
61 uri_str.append( "&QUERY=" );
62 uri_str.append( query );
64 AptxUtil.launchWebBrowser( new URI( uri_str.toString() ), applet != null, applet, "_aptx_blast" );
66 catch ( final IOException e ) {
67 AptxUtil.showErrorMessage( p, e.toString() );
70 catch ( final URISyntaxException e ) {
71 AptxUtil.showErrorMessage( p, e.toString() );
76 final public static String obtainQueryForBlast( final PhylogenyNode node ) {
78 if ( node.getNodeData().isHasSequence() ) {
79 if ( !ForesterUtil.isEmpty( node.getNodeData().getSequence().getMolecularSequence() ) ) {
80 query = node.getNodeData().getSequence().getMolecularSequence();
82 if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null )
83 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
84 final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence()
85 .getAccession().getValue() );
87 query = id.getValue();
90 if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) {
91 final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence()
94 query = id.getValue();
97 if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) {
98 final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence()
101 query = id.getValue();
104 if ( ForesterUtil.isEmpty( query )
105 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) {
106 final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence()
109 query = id.getValue();
113 if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) {
114 final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getName() );
116 query = id.getValue();
122 final public static boolean isContainsQueryForBlast( final PhylogenyNode node ) {
123 return !ForesterUtil.isEmpty( obtainQueryForBlast( node ) );
126 final public void ddbjBlast( final String geneName ) {
127 // Retrieve accession number list which has specified gene name from searchByXMLPath of ARSA. Please click here for details of ARSA.
128 /*target: Sequence length is between 300bp and 1000bp.
130 Gene qualifire is same as specified gene name.*/
131 String queryPath = "/ENTRY/DDBJ/division=='HUM' AND (/ENTRY/DDBJ/length>=300 AND "
132 + "/ENTRY/DDBJ/length<=1000) ";
133 queryPath += "AND (/ENTRY/DDBJ/feature-table/feature{/f_key = 'CDS' AND ";
134 queryPath += "/f_quals/qualifier{/q_name = 'gene' AND /q_value=='" + geneName + "'}})";
135 String query = "service=ARSA&method=searchByXMLPath&queryPath=" + queryPath
136 + "&returnPath=/ENTRY/DDBJ/primary-accession&offset=1&count=100";
138 String arsaResult = null;
140 arsaResult = RestUtil.getResult( query );
142 catch ( final IOException e ) {
143 // TODO Auto-generated catch block
146 final String[] arsaResultLines = arsaResult.split( "\n" );
148 final int arsaResultNum = Integer.parseInt( arsaResultLines[ 0 ].replaceAll( "hitscount =", "" ).trim() );
149 //If there is no hit, print a message and exit
150 if ( arsaResultNum == 0 ) {
151 System.out.println( "There is no entry for gene:" + geneName );
154 //Retrieve DNA sequence of top hit entry by using getFASTA_DDBJEntry of GetEntry.
155 //Retrieve DNA sequence of first fit.
156 final String repAccession = arsaResultLines[ 2 ];
157 query = "service=GetEntry&method=getFASTA_DDBJEntry&accession=" + repAccession;
158 String dnaSeq = null;
160 dnaSeq = RestUtil.getResult( query );
162 catch ( final IOException e ) {
163 // TODO Auto-generated catch block
166 System.out.println( "Retrieved DNA sequence is: " + dnaSeq );
167 //Execute blastn by using searchParam of Blast with step2's sequence. Specified option is -e 0.0001 -m 8 -b 50 -v 50. It means "Extract top 50 hit which E-value is more than 0.0001.". The reference databases are specified as follows. ddbjpri(primates) ddbjrod(rodents) ddbjmam(mammals) ddbjvrt(vertebrates ) ddbjinv(invertebrates).
168 //Execute blastn with step3's sequence
169 query = "service=Blast&method=searchParam&program=blastn&database=ddbjpri ddbjrod ddbjmam ddbjvrt "
170 + "ddbjinv&query=" + dnaSeq + "¶m=-m 8 -b 50 -v 50 -e 0.0001";
171 String blastResult = null;
173 blastResult = RestUtil.getResult( query );
175 catch ( final IOException e ) {
176 // TODO Auto-generated catch block
179 // Extract both accession number and similarity score from BLAST result.
180 // This step does not use Web API and extract the part of result or edit the result. Please click here to see the details of each column in the BLAST tab delimited format which is generated by -m 8 option.
181 final String blastResultLines[] = blastResult.split( "\n" );
182 final Vector<String[]> parsedBlastResult = new Vector<String[]>();
183 for( final String blastResultLine : blastResultLines ) {
184 final String cols[] = blastResultLine.split( "\t" );
185 final String accession = cols[ 1 ].substring( 0, cols[ 1 ].indexOf( "|" ) );
186 final String[] result = { accession, cols[ 2 ] };
187 parsedBlastResult.add( result );
189 // Retrieve species name by using searchByXMLPath of ARSA. If the plural subjects whose species
190 // name are same are in the result, the highest similarity score is used.
191 //Retrieve species from accession number.
192 final Hashtable<String, String> organismAccession = new Hashtable<String, String>();
193 for( int i = 0; i < parsedBlastResult.size(); i++ ) {
194 final String[] parsed = parsedBlastResult.elementAt( i );
195 query = "service=ARSA&method=searchByXMLPath&queryPath=/ENTRY/DDBJ/primary-accession=='" + parsed[ 0 ]
196 + "'&returnPath=/ENTRY/DDBJ/organism&offset=1&count=100";
197 String organism = null;
199 organism = RestUtil.getResult( query );
201 catch ( final IOException e ) {
202 // TODO Auto-generated catch block
205 final String[] organismLines = organism.split( "\n" );
206 organism = organismLines[ 2 ];
207 //If same organism name hits, use first hit.
208 if ( !organismAccession.containsKey( organism ) ) {
209 organismAccession.put( organism, parsed[ 0 ] + "\t" + parsed[ 1 ] );
214 System.out.println( "DDBJ entries: " + arsaResultNum );
215 System.out.println( "Representative accession: " + repAccession );
216 System.out.println( "Organism name\tDDBJ accession number\tSequence similarity" );
217 final String[] keys = new String[ organismAccession.size() ];
218 final Enumeration<String> enu = organismAccession.keys();
220 while ( enu.hasMoreElements() ) {
221 keys[ count ] = enu.nextElement();
225 for( final String key : keys ) {
226 System.out.println( key + "\t" + organismAccession.get( key ) );