1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
32 import org.forester.analysis.TaxonomyDataObtainer.QUERY_TYPE;
33 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
34 import org.forester.phylogeny.Phylogeny;
35 import org.forester.phylogeny.PhylogenyNode;
36 import org.forester.phylogeny.data.Identifier;
37 import org.forester.phylogeny.data.Taxonomy;
38 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
39 import org.forester.util.ForesterUtil;
40 import org.forester.ws.uniprot.UniProtTaxonomy;
41 import org.forester.ws.uniprot.UniProtWsTools;
43 public final class AncestralTaxonomyInference {
// Passed as the second argument to every UniProtWsTools lookup in this class;
// judging by its name it caps the number of taxonomies a lookup returns.
45 private static final int MAX_TAXONOMIES_TO_RETURN = 10;
/**
 * Resolves a query to a single {@code UniProtTaxonomy}, consulting the supplied cache first.
 *
 * If {@code cache} contains the query as a key, a copy (via {@code copy()}) of the cached
 * entry is returned. Otherwise the query is handed to one of the lookups in this class
 * (by id, taxonomy code, scientific name or common name, each taking the query cast to
 * {@code String}), or, for a query cast to {@code List<String>}, the result of
 * {@code obtainUniProtTaxonomyFromLineage} is returned directly.
 *
 * When a lookup yields exactly one taxonomy, that taxonomy is stored in the
 * {@code TaxonomyDataObtainer} cache maps under each of its non-empty keys
 * (scientific name, code, common name, id).
 *
 * @param cache map consulted with the query as key before any lookup is made
 * @param qt    query type; presumably selects which of the lookups below is used (TODO confirm)
 * @throws IOException declared; the id lookup called here declares it
 * @throws AncestralTaxonomyInferenceException declared; the lineage lookup called here declares it
 */
47 private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
49 final QUERY_TYPE qt ) throws IOException,
50 AncestralTaxonomyInferenceException {
// Cache hit: hand back a copy of the cached taxonomy.
51 if ( cache.containsKey( query ) ) {
52 return cache.get( query ).copy();
// Cache miss: collect candidate taxonomies from the lookup matching the query type.
55 List<UniProtTaxonomy> up_taxonomies = null;
// Lookup by identifier.
58 up_taxonomies = getTaxonomiesFromId( ( String ) query );
// Lookup by taxonomy code.
61 up_taxonomies = getTaxonomiesFromTaxonomyCode( ( String ) query );
// Lookup by scientific name.
64 up_taxonomies = getTaxonomiesFromScientificName( ( String ) query );
// Lookup by common name.
67 up_taxonomies = getTaxonomiesFromCommonName( ( String ) query );
// Lineage queries return straight from obtainUniProtTaxonomyFromLineage.
70 return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
// NOTE(review): thrown without a message; presumably reached for an unsupported query type -- confirm.
72 throw new RuntimeException();
// Only an unambiguous result (exactly one taxonomy) is accepted and cached.
74 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
75 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
// Register the taxonomy under every non-empty key so later queries of any type can find it.
76 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
77 TaxonomyDataObtainer.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
79 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
80 TaxonomyDataObtainer.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
82 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
83 TaxonomyDataObtainer.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
85 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
86 TaxonomyDataObtainer.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
/**
 * Looks up taxonomies by common name by delegating to
 * {@code UniProtWsTools.getTaxonomiesFromCommonNameStrict}, passing
 * {@code MAX_TAXONOMIES_TO_RETURN} as the second argument.
 *
 * @param query the common name to look up
 * @return whatever the UniProtWsTools call returns
 */
96 private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
98 return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
/**
 * Looks up taxonomies by identifier by delegating to
 * {@code UniProtWsTools.getTaxonomiesFromId}, passing
 * {@code MAX_TAXONOMIES_TO_RETURN} as the second argument.
 *
 * @param query the identifier to look up
 * @return whatever the UniProtWsTools call returns
 * @throws IOException propagated from the UniProtWsTools call
 */
101 private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
102 return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
/**
 * Looks up taxonomies by scientific name by delegating to
 * {@code UniProtWsTools.getTaxonomiesFromScientificNameStrict}, passing
 * {@code MAX_TAXONOMIES_TO_RETURN} as the second argument.
 *
 * @param query the scientific name to look up
 * @return whatever the UniProtWsTools call returns
 */
105 private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
107 return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
/**
 * Looks up taxonomies by taxonomy code by delegating to
 * {@code UniProtWsTools.getTaxonomiesFromTaxonomyCode}, passing
 * {@code MAX_TAXONOMIES_TO_RETURN} as the second argument.
 *
 * @param query the taxonomy code to look up
 * @return whatever the UniProtWsTools call returns
 */
110 private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
112 return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
/**
 * Infers a taxonomy for every internal (non-external) node of a phylogeny from the
 * taxonomies of that node's descendants.
 *
 * Calls {@code TaxonomyDataObtainer.clearCachesIfTooLarge()} once up front, then visits the
 * nodes in postorder and runs the node-level inference on each non-external node. The
 * node-level inference reads the taxonomies of a node's descendants, which is why the
 * traversal is postorder.
 *
 * @param phy the phylogeny whose internal nodes receive inferred taxonomies
 * @throws IOException propagated from the node-level inference
 * @throws AncestralTaxonomyInferenceException propagated from the node-level inference
 */
115 public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException,
116 AncestralTaxonomyInferenceException {
// Give the taxonomy caches a chance to be emptied before this run adds to them.
117 TaxonomyDataObtainer.clearCachesIfTooLarge();
118 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
119 final PhylogenyNode node = iter.next();
// External nodes are skipped; the node-level method rejects them with an IllegalArgumentException.
120 if ( !node.isExternal() ) {
121 inferTaxonomyFromDescendents( node );
/**
 * Replaces the taxonomy of internal node {@code n} with one derived from the lineages of
 * the nodes returned by {@code n.getDescendants()}.
 *
 * Outline, as far as the code shows:
 * 1. Any existing taxonomy on {@code n} is cleared.
 * 2. For each descendant carrying usable taxonomy data (an appropriate id, scientific name,
 *    lineage, taxonomy code or common name) a lineage array is collected, taken from the
 *    descendant's own taxonomy if present, otherwise from the UniProt taxonomy obtained for it.
 * 3. The lineages are compared position by position, starting at index 0, up to the length
 *    of the shortest one; matching elements form the common lineage.
 * 4. If nothing is shared but both a lineage starting with {@code UniProtTaxonomy.VIRUSES}
 *    and one starting with {@code UniProtTaxonomy.CELLULAR_ORGANISMS} were seen,
 *    {@code CELLULAR_ORGANISMS} is used as the common lineage.
 * 5. A new {@code Taxonomy} named after the last common element is set on {@code n} and
 *    enriched with rank, id, common name, synonym and lineage from UniProt where available.
 * 6. Non-external descendants whose taxonomy is equal to the inferred one have theirs removed.
 *
 * @param n the internal node to annotate; must not be external
 * @throws IllegalArgumentException if {@code n} is external
 * @throws IOException declared; lookups reached from here declare it
 * @throws AncestralTaxonomyInferenceException if a descendant has no usable taxonomy or
 *         lineage, or if the lineages have nothing in common
 */
126 private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException,
127 AncestralTaxonomyInferenceException {
128 if ( n.isExternal() ) {
129 throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
// Drop any taxonomy already present on n; a new one is set further down.
131 n.getNodeData().setTaxonomy( null );
132 final List<PhylogenyNode> descs = n.getDescendants();
133 final List<String[]> lineages = new ArrayList<String[]>();
// Length of the shortest collected lineage; bounds the position-wise comparison below.
134 int shortest_lin_length = Integer.MAX_VALUE;
135 for( final PhylogenyNode desc : descs ) {
// A descendant is usable if its taxonomy carries at least one of: appropriate id,
// scientific name, lineage, taxonomy code, common name.
136 if ( desc.getNodeData().isHasTaxonomy()
137 && ( TaxonomyDataObtainer.isHasAppropriateId( desc.getNodeData().getTaxonomy() )
138 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
139 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() )
140 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
141 .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
142 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null );
// No UniProt taxonomy and no lineage on the node itself: nothing to work with for this descendant.
143 if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) {
// Label the node for the error message: quoted name if it has one, bracketed id otherwise.
144 String desc_str = "";
145 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
146 desc_str = "\"" + desc.getName() + "\"";
149 desc_str = "[" + desc.getId() + "]";
// The offending taxonomy and its lineage are printed to standard output before failing.
151 System.out.println( desc.getNodeData().getTaxonomy().toString() );
152 System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(),
154 throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str
155 + " could not be established from the database" );
// Prefer the lineage stored on the node; fall back to the UniProt lineage when it is missing or empty.
157 String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() );
158 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
159 lineage = ForesterUtil.stringListToArray( up_tax.getLineage() );
161 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
162 throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \""
163 + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" );
165 if ( lineage.length < shortest_lin_length ) {
166 shortest_lin_length = lineage.length;
168 lineages.add( lineage );
// Descendant without usable taxonomy data: label it (quoted name, else bracketed id) and fail.
172 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
173 node = "\"" + desc.getName() + "\"";
176 node = "[" + desc.getId() + "]";
178 // final List<PhylogenyNode> e = desc.getAllExternalDescendants();
180 // System.out.println();
182 // for( final PhylogenyNode object : e ) {
183 // System.out.println( x + ":" );
184 // System.out.println( object.getName() + " " );
187 // System.out.println();
189 throw new AncestralTaxonomyInferenceException( "node " + node
190 + " has no or inappropriate taxonomic information" );
// Elements shared by all lineages, compared position by position from index 0.
193 final List<String> last_common_lineage = new ArrayList<String>();
// Most recently accepted shared element; becomes the scientific name of the inferred taxonomy.
194 String last_common = null;
195 if ( shortest_lin_length > 0 ) {
// The first lineage is the reference; every other lineage is compared against it at position i.
// NOTE(review): label I presumably lets the inner loop leave the outer loop at the first mismatch -- confirm.
196 I: for( int i = 0; i < shortest_lin_length; ++i ) {
197 final String lineage_0 = lineages.get( 0 )[ i ];
198 for( int j = 1; j < lineages.size(); ++j ) {
199 if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
203 last_common_lineage.add( lineage_0 );
204 last_common = lineage_0;
// Nothing shared: check whether the lineages mix viruses and cellular organisms,
// judged by a case-insensitive comparison of each lineage's first element.
207 if ( last_common_lineage.isEmpty() ) {
208 boolean saw_viruses = false;
209 boolean saw_cellular_organism = false;
210 for( final String[] lineage : lineages ) {
211 if ( lineage.length > 0 ) {
212 if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) {
215 else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) {
216 saw_cellular_organism = true;
218 if ( saw_cellular_organism && saw_viruses ) {
// A mix of viruses and cellular organisms is assigned CELLULAR_ORGANISMS as the common lineage.
223 if ( saw_cellular_organism && saw_viruses ) {
224 last_common_lineage.add( UniProtTaxonomy.CELLULAR_ORGANISMS );
225 last_common = UniProtTaxonomy.CELLULAR_ORGANISMS;
// Error message enumerating the collected lineages for the "no common lineage" failure.
228 String msg = "no common lineage for:\n";
230 for( final String[] strings : lineages ) {
231 msg += counter + ": ";
233 for( final String string : strings ) {
238 throw new AncestralTaxonomyInferenceException( msg );
// Attach the new taxonomy to n, named after the last shared lineage element.
241 final Taxonomy tax = new Taxonomy();
242 n.getNodeData().setTaxonomy( tax );
243 tax.setScientificName( last_common );
// Look up the UniProt record matching the common lineage to fill in further fields.
244 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromLineage( last_common_lineage );
245 if ( up_tax != null ) {
246 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
// The rank is lower-cased before being set; a PhyloXmlDataFormatException from this is caught below.
248 tax.setRank( up_tax.getRank().toLowerCase() );
250 catch ( final PhyloXmlDataFormatException ex ) {
// The UniProt id is stored with "uniprot" as the second Identifier argument.
254 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
255 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
257 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
258 tax.setCommonName( up_tax.getCommonName() );
// Add the synonym only if it is not already present.
260 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
261 tax.getSynonyms().add( up_tax.getSynonym() );
// Copy the UniProt lineage, leaving out empty elements.
263 if ( up_tax.getLineage() != null ) {
264 tax.setLineage( new ArrayList<String>() );
265 for( final String lin : up_tax.getLineage() ) {
266 if ( !ForesterUtil.isEmpty( lin ) ) {
267 tax.getLineage().add( lin );
// Fallback: if no lineage was set above, use the computed common lineage (again skipping empty elements).
272 if ( ForesterUtil.isEmpty( tax.getLineage() ) ) {
273 tax.setLineage( new ArrayList<String>() );
274 for( final String lin : last_common_lineage ) {
275 if ( !ForesterUtil.isEmpty( lin ) ) {
276 tax.getLineage().add( lin );
// Remove the taxonomy from internal descendants whose taxonomy equals the one just inferred for n.
280 for( final PhylogenyNode desc : descs ) {
281 if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
282 && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
283 desc.getNodeData().setTaxonomy( null );
/**
 * Finds the {@code UniProtTaxonomy} whose lineage matches the given lineage.
 *
 * The lineage joined with {@code ">"} is the key into
 * {@code TaxonomyDataObtainer.getLineageTaxCacheMap()}; on a cache hit a copy of the cached
 * entry is used. Otherwise candidates are fetched by scientific name using the last element
 * of {@code lineage}, and each candidate's lineage is compared element by element
 * (case-insensitively) against {@code lineage}. The resulting taxonomy is stored in the
 * lineage cache and in the scientific-name, code, common-name and id caches under its
 * non-empty keys.
 *
 * NOTE(review): {@code lineage.get( lineage.size() - 1 )} is evaluated with no emptiness
 * check in view; confirm callers never pass an empty list.
 *
 * @param lineage the lineage to resolve
 * @throws AncestralTaxonomyInferenceException with "is not unique" when a second candidate is
 *         accepted, or "not found" when none is
 * @throws IOException declared on this method
 */
291 private static UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
292 throws AncestralTaxonomyInferenceException, IOException {
// Cache key: lineage elements joined with ">" (the error messages below use " > " instead).
293 final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
294 UniProtTaxonomy up_tax = null;
295 if ( TaxonomyDataObtainer.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
296 up_tax = TaxonomyDataObtainer.getLineageTaxCacheMap().get( lineage_str ).copy();
// Cache miss: query by the scientific name given by the last lineage element.
299 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
300 .get( lineage.size() - 1 ) );
301 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
302 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
// A candidate starts out as a match and is compared position by position with the query lineage.
303 boolean match = true;
// NOTE(review): up_taxonomy.getLineage().get( i ) has no length check in view; if a candidate's
// lineage can be shorter than the query lineage this would go out of bounds -- confirm.
304 I: for( int i = 0; i < lineage.size(); ++i ) {
305 if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
// A candidate accepted while one is already held means the lineage is ambiguous.
311 if ( up_tax != null ) {
312 throw new AncestralTaxonomyInferenceException( "lineage \""
313 + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
315 up_tax = up_taxonomy;
// No candidate accepted.
318 if ( up_tax == null ) {
319 throw new AncestralTaxonomyInferenceException( "lineage \""
320 + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
// Register the result under the lineage key and under each of its non-empty keys.
322 TaxonomyDataObtainer.getLineageTaxCacheMap().put( lineage_str, up_tax );
323 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
324 TaxonomyDataObtainer.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
326 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
327 TaxonomyDataObtainer.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
329 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
330 TaxonomyDataObtainer.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
332 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
333 TaxonomyDataObtainer.getIdTaxCacheMap().put( up_tax.getId(), up_tax );