7f0477ed05faccdb49269544639d216d5fbea1d1
[jalview.git] / forester / java / src / org / forester / archaeopteryx / tools / SequenceDataRetriver.java
1 // Exp $
2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
4 //
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.archaeopteryx.tools;
27
28 import java.io.FileNotFoundException;
29 import java.io.IOException;
30 import java.net.UnknownHostException;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
33 import java.util.regex.Pattern;
34
35 import javax.swing.JOptionPane;
36
37 import org.forester.archaeopteryx.MainFrameApplication;
38 import org.forester.archaeopteryx.TreePanel;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyNode;
41 import org.forester.phylogeny.data.Accession;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.data.Sequence;
44 import org.forester.phylogeny.data.Taxonomy;
45 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
46 import org.forester.util.ForesterUtil;
47 import org.forester.ws.uniprot.SequenceDatabaseEntry;
48 import org.forester.ws.uniprot.UniProtWsTools;
49
50 public final class SequenceDataRetriver implements Runnable {
51
52     // uniprot/expasy accession number format (6 chars):
53     // letter digit letter-or-digit letter-or-digit letter-or-digit digit
54     private final static Pattern       UNIPROT_AC_PATTERN = Pattern.compile( "[A-NR-ZOPQ]\\d[A-Z0-9]{3}\\d" );
55     private final Phylogeny            _phy;
56     private final MainFrameApplication _mf;
57     private final TreePanel            _treepanel;
58     private final static boolean       DEBUG              = true;
59
60     private enum Db {
61         UNKNOWN, UNIPROT;
62     }
63
64     public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
65         _phy = phy;
66         _mf = mf;
67         _treepanel = treepanel;
68     }
69
70     private String getBaseUrl() {
71         return UniProtWsTools.BASE_URL;
72     }
73
74     private void execute() {
75         _mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
76         SortedSet<String> not_found = null;
77         try {
78             not_found = obtainSeqInformation( _phy );
79         }
80         catch ( final UnknownHostException e ) {
81             _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
82             JOptionPane.showMessageDialog( _mf,
83                                            "Could not connect to \"" + getBaseUrl() + "\"",
84                                            "Network error during taxonomic information gathering",
85                                            JOptionPane.ERROR_MESSAGE );
86             return;
87         }
88         catch ( final IOException e ) {
89             _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
90             e.printStackTrace();
91             JOptionPane.showMessageDialog( _mf,
92                                            e.toString(),
93                                            "Failed to obtain taxonomic information",
94                                            JOptionPane.ERROR_MESSAGE );
95             return;
96         }
97         finally {
98             _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
99         }
100         _treepanel.setTree( _phy );
101         _mf.showWhole();
102         _treepanel.setEdited( true );
103         if ( ( not_found != null ) && ( not_found.size() > 0 ) ) {
104             int max = not_found.size();
105             boolean more = false;
106             if ( max > 20 ) {
107                 more = true;
108                 max = 20;
109             }
110             final StringBuffer sb = new StringBuffer();
111             sb.append( "Not all identifiers could be resolved.\n" );
112             if ( not_found.size() == 1 ) {
113                 sb.append( "The following identifier was not found:\n" );
114             }
115             else {
116                 sb.append( "The following identifiers were not found (total: " + not_found.size() + "):\n" );
117             }
118             int i = 0;
119             for( final String string : not_found ) {
120                 if ( i > 19 ) {
121                     break;
122                 }
123                 sb.append( string );
124                 sb.append( "\n" );
125                 ++i;
126             }
127             if ( more ) {
128                 sb.append( "..." );
129             }
130             try {
131                 JOptionPane.showMessageDialog( _mf,
132                                                sb.toString(),
133                                                "UniProt Sequence Tool Completed",
134                                                JOptionPane.WARNING_MESSAGE );
135             }
136             catch ( final Exception e ) {
137                 // Not important if this fails, do nothing. 
138             }
139         }
140         else {
141             try {
142                 JOptionPane.showMessageDialog( _mf,
143                                                "UniProt sequence tool successfully completed",
144                                                "UniProt Sequence Tool Completed",
145                                                JOptionPane.INFORMATION_MESSAGE );
146             }
147             catch ( final Exception e ) {
148                 // Not important if this fails, do nothing.
149             }
150         }
151     }
152
153     synchronized public static SortedSet<String> obtainSeqInformation( final Phylogeny phy ) throws IOException {
154         final SortedSet<String> not_found = new TreeSet<String>();
155         for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
156             final PhylogenyNode node = iter.next();
157             Sequence seq = null;
158             Taxonomy tax = null;
159             if ( node.getNodeData().isHasSequence() ) {
160                 seq = node.getNodeData().getSequence();
161             }
162             else {
163                 seq = new Sequence();
164             }
165             if ( node.getNodeData().isHasTaxonomy() ) {
166                 tax = node.getNodeData().getTaxonomy();
167             }
168             else {
169                 tax = new Taxonomy();
170             }
171             String query = null;
172             Db db = Db.UNKNOWN;
173             if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
174                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
175                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
176                     && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "uniprot" ) ) {
177                 query = node.getNodeData().getSequence().getAccession().getValue();
178                 db = Db.UNIPROT;
179             }
180             else if ( !ForesterUtil.isEmpty( node.getName() ) ) {
181                 query = node.getName();
182             }
183             if ( !ForesterUtil.isEmpty( query ) ) {
184                 if ( query.indexOf( '/' ) > 0 ) {
185                     query = query.substring( 0, query.indexOf( '/' ) );
186                 }
187                 if ( query.indexOf( '.' ) > 0 ) {
188                     query = query.substring( 0, query.indexOf( '.' ) );
189                 }
190                 if ( query.indexOf( '_' ) > 0 ) {
191                     query = query.substring( 0, query.indexOf( '_' ) );
192                 }
193                 SequenceDatabaseEntry db_entry = null;
194                 if ( ( db == Db.UNIPROT ) || UNIPROT_AC_PATTERN.matcher( query ).matches() ) {
195                     if ( DEBUG ) {
196                         System.out.println( "uniprot: " + query );
197                     }
198                     try {
199                         db_entry = UniProtWsTools.obtainUniProtEntry( query, 200 );
200                     }
201                     catch ( final FileNotFoundException e ) {
202                         // Ignore.
203                     }
204                 }
205                 if ( db_entry != null ) {
206                     if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
207                         seq.setAccession( new Accession( db_entry.getAccession(), "uniprot" ) );
208                     }
209                     if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
210                         seq.setName( db_entry.getSequenceName() );
211                     }
212                     if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
213                         seq.setSymbol( db_entry.getSequenceSymbol() );
214                     }
215                     if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
216                         tax.setScientificName( db_entry.getTaxonomyScientificName() );
217                     }
218                     if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
219                         tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
220                     }
221                     node.getNodeData().setTaxonomy( tax );
222                     node.getNodeData().setSequence( seq );
223                 }
224                 else {
225                     not_found.add( node.getName() );
226                 }
227             }
228         }
229         return not_found;
230     }
231
232     @Override
233     public void run() {
234         execute();
235     }
236 }