1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.List;
31 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
32 import org.forester.phylogeny.Phylogeny;
33 import org.forester.phylogeny.PhylogenyNode;
34 import org.forester.phylogeny.data.Identifier;
35 import org.forester.phylogeny.data.Taxonomy;
36 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
37 import org.forester.util.ForesterUtil;
38 import org.forester.ws.seqdb.UniProtTaxonomy;
40 public final class AncestralTaxonomyInference {
42 public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException,
43 AncestralTaxonomyInferenceException {
44 TaxonomyDataManager.clearCachesIfTooLarge();
45 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
46 final PhylogenyNode node = iter.next();
47 if ( !node.isExternal() ) {
48 inferTaxonomyFromDescendents( node );
/**
 * Infers the taxonomy of a single non-external node from the taxonomies of the
 * nodes returned by {@code n.getDescendants()}.
 * <p>
 * Outline of what the visible code does:
 * <ol>
 * <li>clears any taxonomy currently set on {@code n};</li>
 * <li>collects one lineage (as a {@code String[]}) per descendant, taken from the
 * descendant's own taxonomy or, if that is empty, from the entry returned by
 * {@code TaxonomyDataManager.obtainUniProtTaxonomy};</li>
 * <li>compares the lineages position by position against the first lineage, up to
 * the length of the shortest one, recording matching entries;</li>
 * <li>if nothing was recorded, inspects the first entry of each lineage for
 * {@code UniProtTaxonomy.VIRUSES} and {@code UniProtTaxonomy.CELLULAR_ORGANISMS};</li>
 * <li>sets a new {@code Taxonomy} on {@code n}, named after the last recorded entry,
 * and fills in rank, identifier, common name, synonym and lineage from
 * {@code TaxonomyDataManager.obtainUniProtTaxonomyFromLineage} where available;</li>
 * <li>clears the taxonomy of every non-external descendant whose taxonomy
 * {@code isEqual} to the new one.</li>
 * </ol>
 * NOTE(review): several lines of this method are not visible in this view
 * (closing braces, else branches, some local declarations); comments below mark
 * the places where the reading depends on those unseen lines.
 *
 * @param n the node to infer a taxonomy for; must not be external
 * @throws IllegalArgumentException if {@code n} is external
 * @throws IOException declared; presumably raised by the TaxonomyDataManager lookups -- confirm
 * @throws AncestralTaxonomyInferenceException if a descendant's taxonomy or lineage
 *         cannot be established, or if a descendant is reported as having no or
 *         inappropriate taxonomic information, or with the "no common lineage" message
 */
53 private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException,
54 AncestralTaxonomyInferenceException {
// Inference is only defined for non-external nodes.
55 if ( n.isExternal() ) {
56 throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
// Discard whatever taxonomy n currently carries before inferring a new one.
58 n.getNodeData().setTaxonomy( null );
59 final List<PhylogenyNode> descs = n.getDescendants();
60 final List<String[]> lineages = new ArrayList<String[]>();
// Length of the shortest collected lineage; bounds the position-wise comparison further down.
61 int shortest_lin_length = Integer.MAX_VALUE;
62 for( final PhylogenyNode desc : descs ) {
// A descendant is used only if it has a taxonomy with an appropriate id or a
// non-empty scientific name, lineage, taxonomy code or common name.
63 if ( desc.getNodeData().isHasTaxonomy()
64 && ( TaxonomyDataManager.isHasAppropriateId( desc.getNodeData().getTaxonomy() )
65 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
66 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() )
67 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
68 .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
69 final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomy( desc.getNodeData()
70 .getTaxonomy(), null, null );
// Neither a looked-up entry nor a lineage on the descendant itself: describe the
// node, print its taxonomy and lineage to standard output, and fail.
71 if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) {
// NOTE(review): the declaration of desc_str and the line separating the two
// assignments below are not visible here; presumably the id form is the else
// case used when the node has no name -- confirm.
73 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
74 desc_str = "\"" + desc.getName() + "\"";
77 desc_str = "[" + desc.getId() + "]";
79 System.out.println( desc.getNodeData().getTaxonomy().toString() );
// NOTE(review): the remaining argument(s) of this call are on a line not visible here.
80 System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(),
82 throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str
83 + " could not be established from the database" );
// Prefer the lineage stored on the descendant's own taxonomy; fall back to the
// lineage of the looked-up entry when that one is null or empty.
85 String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() );
86 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
87 lineage = ForesterUtil.stringListToArray( up_tax.getLineage() );
// Still no lineage after the fallback: give up on this node.
89 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
90 throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \""
91 + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" );
93 if ( lineage.length < shortest_lin_length ) {
94 shortest_lin_length = lineage.length;
96 lineages.add( lineage );
// NOTE(review): presumably the else branch of the usability check above (the
// 'else' line and the declaration of 'node' are not visible here) -- confirm.
// The descendant is described by its quoted name or by its bracketed id.
100 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
101 node = "\"" + desc.getName() + "\"";
104 node = "[" + desc.getId() + "]";
106 throw new AncestralTaxonomyInferenceException( "node " + node
107 + " has no or inappropriate taxonomic information" );
// Entries recorded by the position-wise comparison, and the most recently recorded one.
110 final List<String> last_common_lineage = new ArrayList<String>();
111 String last_common = null;
112 if ( shortest_lin_length > 0 ) {
// Compare position i of the first lineage against position i of every other lineage.
// NOTE(review): the statement executed on a mismatch is not visible here; given the
// label I it presumably leaves the outer loop, making the result a common prefix -- confirm.
113 I: for( int i = 0; i < shortest_lin_length; ++i ) {
114 final String lineage_0 = lineages.get( 0 )[ i ];
115 for( int j = 1; j < lineages.size(); ++j ) {
116 if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
120 last_common_lineage.add( lineage_0 );
121 last_common = lineage_0;
// Nothing was recorded: look at the first entry of each lineage (case-insensitively)
// for the viruses and cellular-organisms markers.
124 if ( last_common_lineage.isEmpty() ) {
125 boolean saw_viruses = false;
126 boolean saw_cellular_organism = false;
127 for( final String[] lineage : lineages ) {
128 if ( lineage.length > 0 ) {
// NOTE(review): the body of the VIRUSES branch is not visible here; presumably it sets saw_viruses -- confirm.
129 if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) {
132 else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) {
133 saw_cellular_organism = true;
// NOTE(review): body not visible; presumably stops scanning once both kinds have been seen -- confirm.
135 if ( saw_cellular_organism && saw_viruses ) {
// When both viruses and cellular organisms were seen, CELLULAR_ORGANISMS is
// recorded as the single lineage entry and as the last common entry.
140 if ( saw_cellular_organism && saw_viruses ) {
141 last_common_lineage.add( UniProtTaxonomy.CELLULAR_ORGANISMS );
142 last_common = UniProtTaxonomy.CELLULAR_ORGANISMS;
// NOTE(review): presumably the alternative to the branch above ('else' not visible):
// builds a message enumerating the collected lineages and throws. The declaration of
// 'counter' and the statements appending to msg inside the inner loop are not visible here.
145 String msg = "no common lineage for:\n";
147 for( final String[] strings : lineages ) {
148 msg += counter + ": ";
150 for( final String string : strings ) {
155 throw new AncestralTaxonomyInferenceException( msg );
// Attach a fresh taxonomy to n, named after the last recorded lineage entry.
158 final Taxonomy tax = new Taxonomy();
159 n.getNodeData().setTaxonomy( tax );
160 tax.setScientificName( last_common );
// Look up an entry for the recorded lineage and copy over each field that is non-empty.
161 final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomyFromLineage( last_common_lineage );
162 if ( up_tax != null ) {
163 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
// The rank is stored lower-cased.
165 tax.setRank( up_tax.getRank().toLowerCase() );
// NOTE(review): the enclosing 'try' and the body of this handler are not visible here.
167 catch ( final PhyloXmlDataFormatException ex ) {
171 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
// The identifier is created with "uniprot" as its second constructor argument.
172 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
174 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
175 tax.setCommonName( up_tax.getCommonName() );
// Add the synonym only if it is not already present.
177 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
178 tax.getSynonyms().add( up_tax.getSynonym() );
// Copy the entry's lineage into a new list, dropping empty elements.
180 if ( up_tax.getLineage() != null ) {
181 tax.setLineage( new ArrayList<String>() );
182 for( final String lin : up_tax.getLineage() ) {
183 if ( !ForesterUtil.isEmpty( lin ) ) {
184 tax.getLineage().add( lin );
// If the taxonomy still has no lineage, fall back to the entries recorded above,
// again dropping empty elements.
189 if ( ForesterUtil.isEmpty( tax.getLineage() ) ) {
190 tax.setLineage( new ArrayList<String>() );
191 for( final String lin : last_common_lineage ) {
192 if ( !ForesterUtil.isEmpty( lin ) ) {
193 tax.getLineage().add( lin );
// Clear the taxonomy of every non-external descendant whose taxonomy isEqual to the
// newly inferred one (presumably so the same taxonomy is not repeated below n -- confirm).
197 for( final PhylogenyNode desc : descs ) {
198 if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
199 && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
200 desc.getNodeData().setTaxonomy( null );