forester/java/src/org/forester/archaeopteryx/tools/Blast.java

   1 // $Id:
   2 // FORESTER -- software libraries and applications
   3 // for evolutionary biology research and applications.
   4 //
   5 // Copyright (C) 2008-2009 Christian M. Zmasek
   6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  25
  26 package org.forester.archaeopteryx.tools;
  27
  28 import java.io.IOException;
  29 import java.net.URI;
  30 import java.net.URISyntaxException;
  31 import java.util.Arrays;
  32 import java.util.Enumeration;
  33 import java.util.Hashtable;
  34 import java.util.Vector;
  35
  36 import javax.swing.JApplet;
  37
  38 import org.forester.archaeopteryx.AptxUtil;
  39 import org.forester.archaeopteryx.TreePanel;
  40 import org.forester.phylogeny.PhylogenyNode;
  41 import org.forester.phylogeny.data.Identifier;
  42 import org.forester.util.ForesterUtil;
  43 import org.forester.util.SequenceIdParser;
  44 import org.forester.ws.wabi.RestUtil;
  45
  46 public final class Blast {
  47
  48     final public static void openNcbiBlastWeb( final String query,
  49                                                final boolean is_nucleic_acids,
  50                                                final JApplet applet,
  51                                                final TreePanel p ) {
  52         //http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE=Proteins&DATABASE=swissprot&QUERY=gi|163848401
  53         final StringBuilder uri_str = new StringBuilder();
  54         uri_str.append( "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&DATABASE=nr&PAGE=" );
  55         if ( is_nucleic_acids ) {
  56             uri_str.append( "Nucleotide" );
  57         }
  58         else {
  59             uri_str.append( "Proteins" );
  60         }
  61         uri_str.append( "&QUERY=" );
  62         uri_str.append( query );
  63         try {
  64             AptxUtil.launchWebBrowser( new URI( uri_str.toString() ), applet != null, applet, "_aptx_blast" );
  65         }
  66         catch ( final IOException e ) {
  67             AptxUtil.showErrorMessage( p, e.toString() );
  68             e.printStackTrace();
  69         }
  70         catch ( final URISyntaxException e ) {
  71             AptxUtil.showErrorMessage( p, e.toString() );
  72             e.printStackTrace();
  73         }
  74     }
  75
  76     final public static String obtainQueryForBlast( final PhylogenyNode node ) {
  77         String query = "";
  78         if ( node.getNodeData().isHasSequence() ) {
  79             if ( !ForesterUtil.isEmpty( node.getNodeData().getSequence().getMolecularSequence() ) ) {
  80                 query = node.getNodeData().getSequence().getMolecularSequence();
  81             }
  82             if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null )
  83                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
  84                 final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getAccession()
  85                         .getValue() );
  86                 if ( id != null ) {
  87                     query = id.getValue();
  88                 }
  89             }
  90             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) {
  91                 final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() );
  92                 if ( id != null ) {
  93                     query = id.getValue();
  94                 }
  95             }
  96             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) {
  97                 final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() );
  98                 if ( id != null ) {
  99                     query = id.getValue();
 100                 }
 101             }
 102         }
 103         if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) {
 104             final Identifier id = SequenceIdParser.parse( node.getName() );
 105             if ( id != null ) {
 106                 query = id.getValue();
 107             }
 108         }
 109         return query;
 110     }
 111
 112     final public static boolean isContainsQueryForBlast( final PhylogenyNode node ) {
 113         return !ForesterUtil.isEmpty( obtainQueryForBlast( node ) );
 114     }
 115
 116     final public void ddbjBlast( final String geneName ) {
 117         // Retrieve accession number list which has specified gene name from searchByXMLPath of ARSA. Please click here for details of ARSA.
 118         /*target: Sequence length is between 300bp and 1000bp.
 119         Feature key is CDS.
 120         Gene qualifire is same as specified gene name.*/
 121         String queryPath = "/ENTRY/DDBJ/division=='HUM' AND (/ENTRY/DDBJ/length>=300 AND "
 122                 + "/ENTRY/DDBJ/length<=1000) ";
 123         queryPath += "AND (/ENTRY/DDBJ/feature-table/feature{/f_key = 'CDS' AND ";
 124         queryPath += "/f_quals/qualifier{/q_name = 'gene' AND /q_value=='" + geneName + "'}})";
 125         String query = "service=ARSA&method=searchByXMLPath&queryPath=" + queryPath
 126                 + "&returnPath=/ENTRY/DDBJ/primary-accession&offset=1&count=100";
 127         //Execute ARSA
 128         String arsaResult = null;
 129         try {
 130             arsaResult = RestUtil.getResult( query );
 131         }
 132         catch ( final IOException e ) {
 133             // TODO Auto-generated catch block
 134             e.printStackTrace();
 135         }
 136         final String[] arsaResultLines = arsaResult.split( "\n" );
 137         //Get hit count
 138         final int arsaResultNum = Integer.parseInt( arsaResultLines[ 0 ].replaceAll( "hitscount       =", "" ).trim() );
 139         //If there is no hit, print a message and exit
 140         if ( arsaResultNum == 0 ) {
 141             System.out.println( "There is no entry for gene:" + geneName );
 142             return;
 143         }
 144         //Retrieve DNA sequence of top hit entry by using getFASTA_DDBJEntry of GetEntry.
 145         //Retrieve DNA sequence of first fit.
 146         final String repAccession = arsaResultLines[ 2 ];
 147         query = "service=GetEntry&method=getFASTA_DDBJEntry&accession=" + repAccession;
 148         String dnaSeq = null;
 149         try {
 150             dnaSeq = RestUtil.getResult( query );
 151         }
 152         catch ( final IOException e ) {
 153             // TODO Auto-generated catch block
 154             e.printStackTrace();
 155         }
 156         System.out.println( "Retrieved DNA sequence is: " + dnaSeq );
 157         //Execute blastn by using searchParam of Blast with step2's sequence. Specified option is -e 0.0001 -m 8 -b 50 -v 50. It means "Extract top 50 hit which E-value is more than 0.0001.". The reference databases are specified as follows. ddbjpri(primates) ddbjrod(rodents) ddbjmam(mammals) ddbjvrt(vertebrates ) ddbjinv(invertebrates).
 158         //Execute blastn with step3's sequence
 159         query = "service=Blast&method=searchParam&program=blastn&database=ddbjpri ddbjrod ddbjmam ddbjvrt "
 160                 + "ddbjinv&query=" + dnaSeq + "&param=-m 8 -b 50 -v 50 -e 0.0001";
 161         String blastResult = null;
 162         try {
 163             blastResult = RestUtil.getResult( query );
 164         }
 165         catch ( final IOException e ) {
 166             // TODO Auto-generated catch block
 167             e.printStackTrace();
 168         }
 169         // Extract both accession number and similarity score from BLAST result.
 170         // This step does not use Web API and extract the part of result or edit the result. Please click here to see the details of each column in the BLAST tab delimited format which is generated by -m 8 option.
 171         final String blastResultLines[] = blastResult.split( "\n" );
 172         final Vector<String[]> parsedBlastResult = new Vector<String[]>();
 173         for( final String blastResultLine : blastResultLines ) {
 174             final String cols[] = blastResultLine.split( "\t" );
 175             final String accession = cols[ 1 ].substring( 0, cols[ 1 ].indexOf( "|" ) );
 176             final String[] result = { accession, cols[ 2 ] };
 177             parsedBlastResult.add( result );
 178         }
 179         // Retrieve species name by using searchByXMLPath of ARSA. If the plural subjects whose species
 180         // name are same are in the result, the highest similarity score is used.
 181         //Retrieve species from accession number.
 182         final Hashtable<String, String> organismAccession = new Hashtable<String, String>();
 183         for( int i = 0; i < parsedBlastResult.size(); i++ ) {
 184             final String[] parsed = parsedBlastResult.elementAt( i );
 185             query = "service=ARSA&method=searchByXMLPath&queryPath=/ENTRY/DDBJ/primary-accession=='" + parsed[ 0 ]
 186                     + "'&returnPath=/ENTRY/DDBJ/organism&offset=1&count=100";
 187             String organism = null;
 188             try {
 189                 organism = RestUtil.getResult( query );
 190             }
 191             catch ( final IOException e ) {
 192                 // TODO Auto-generated catch block
 193                 e.printStackTrace();
 194             }
 195             final String[] organismLines = organism.split( "\n" );
 196             organism = organismLines[ 2 ];
 197             //If same organism name hits, use first hit.
 198             if ( !organismAccession.containsKey( organism ) ) {
 199                 organismAccession.put( organism, parsed[ 0 ] + "\t" + parsed[ 1 ] );
 200             }
 201         }
 202         // Print result.
 203         // Print Result
 204         System.out.println( "DDBJ entries: " + arsaResultNum );
 205         System.out.println( "Representative accession: " + repAccession );
 206         System.out.println( "Organism name\tDDBJ accession number\tSequence similarity" );
 207         final String[] keys = new String[ organismAccession.size() ];
 208         final Enumeration<String> enu = organismAccession.keys();
 209         int count = 0;
 210         while ( enu.hasMoreElements() ) {
 211             keys[ count ] = enu.nextElement();
 212             ++count;
 213         }
 214         Arrays.sort( keys );
 215         for( final String key : keys ) {
 216             System.out.println( key + "\t" + organismAccession.get( key ) );
 217         }
 218     }
 219 }