// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.analysis;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
+import java.util.regex.Matcher;
import javax.swing.JOptionPane;
import org.forester.archaeopteryx.TreePanel;
import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer;
import org.forester.archaeopteryx.tools.RunnableProcess;
+import org.forester.io.parsers.nhx.NHXParser;
import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
+import org.forester.io.parsers.util.ParserUtils;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Identifier;
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
+import org.forester.util.TaxonomyUtil;
import org.forester.ws.seqdb.SequenceDbWsTools;
import org.forester.ws.seqdb.UniProtTaxonomy;
CODE, SN, CN, ID, LIN;
}
private static final int MAX_CACHE_SIZE = 100000;
- private static final int MAX_TAXONOMIES_TO_RETURN = 10;
+ private static final int MAX_TAXONOMIES_TO_RETURN = 2000;
private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
private static final HashMap<String, UniProtTaxonomy> _lineage_up_cache_map = new HashMap<String, UniProtTaxonomy>();
private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
return _sn_up_cache_map;
}
+
+ @SuppressWarnings("unchecked")
private final static UniProtTaxonomy obtainTaxonomy( final HashMap<String, UniProtTaxonomy> cache,
final Object query,
final QUERY_TYPE qt ) throws IOException,
- AncestralTaxonomyInferenceException {
+ AncestralTaxonomyInferenceException {
if ( cache.containsKey( query ) ) {
return cache.get( query ).copy();
}
break;
case LIN:
return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
+
default:
throw new RuntimeException();
}
}
private final static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query ) throws IOException {
+ if ( query.equalsIgnoreCase( UniProtTaxonomy.BACTERIA ) || query.equalsIgnoreCase( UniProtTaxonomy.ARCHAEA )
+ || query.equalsIgnoreCase( UniProtTaxonomy.VIRUSES )
+ || query.equalsIgnoreCase( UniProtTaxonomy.EUKARYOTA ) || query.equalsIgnoreCase( UniProtTaxonomy.X ) ) {
+ final List<UniProtTaxonomy> l = new ArrayList<UniProtTaxonomy>();
+ l.add( UniProtTaxonomy.createSpecialFromScientificName( query ) );
+ return l;
+ }
return SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
}
private final static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query ) throws IOException {
+ //FIXME: resolve the "SPHAR" special case that is hard-coded in the condition below
+ if ( ( ( query.indexOf( "XX" ) == 3 ) && TaxonomyUtil.isHasTaxIdFromFakeTaxCode( query ) )
+ || query.equals( "SPHAR" ) /* TODO remove me, is same as Sphingomonas aromaticivorans */
+ ) {
+ final int id = TaxonomyUtil.getTaxIdFromFakeTaxCode( query );
+ return SequenceDbWsTools.getTaxonomiesFromId( String.valueOf( id ), MAX_TAXONOMIES_TO_RETURN );
+ }
return SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
}
synchronized final private static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
final boolean delete,
final boolean allow_to_use_basic_node_names )
- throws IOException, AncestralTaxonomyInferenceException {
+ throws IOException, AncestralTaxonomyInferenceException {
clearCachesIfTooLarge();
final SortedSet<String> not_found = new TreeSet<String>();
List<PhylogenyNode> not_found_external_nodes = null;
if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
|| !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() ) ) )
|| ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) {
- if ( tax != null ) {
+ if ( ( ( tax != null ) && ( isHasAppropriateId( tax )
+ || !ForesterUtil.isEmpty( tax.getScientificName() )
+ || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil
+ .isEmpty( tax.getCommonName() ) ) ) ) {
uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
}
else {
if ( tax == null ) {
tax = new Taxonomy();
node.getNodeData().addTaxonomy( tax );
- node.setName( "" );
}
updateTaxonomy( qt, node, tax, uniprot_tax );
}
if ( ForesterUtil.isEmpty( simple_name ) ) {
throw new IllegalArgumentException( "illegal attempt to use empty simple name" );
}
- qt = QUERY_TYPE.SN;
- UniProtTaxonomy ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), simple_name, qt );
- if ( ut == null ) {
+ UniProtTaxonomy ut = null;
+ final String code = ParserUtils.extractTaxonomyCodeFromNodeName( simple_name,
+ NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+ if ( !ForesterUtil.isEmpty( code ) ) {
qt = QUERY_TYPE.CODE;
- ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), simple_name, qt );
+ ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), code, qt );
}
if ( ut == null ) {
- qt = QUERY_TYPE.CN;
- ut = obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), simple_name, qt );
+ final String sn = ParserUtils.extractScientificNameFromNodeName( simple_name );
+ if ( !ForesterUtil.isEmpty( sn ) ) {
+ qt = QUERY_TYPE.SN;
+ ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
+ }
+ }
+ if ( ut == null ) {
+ final String id = ParserUtils
+ .extractUniprotTaxonomyIdFromNodeName( simple_name,
+ NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
+ if ( !ForesterUtil.isEmpty( id ) ) {
+ qt = QUERY_TYPE.ID;
+ ut = obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), id, qt );
+ }
+ }
+ if ( ut == null ) {
+ String sn = "";
+ final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name );
+ if ( m.matches() ) {
+ sn = m.group( 1 );
+ }
+ if ( !ForesterUtil.isEmpty( sn ) ) {
+ qt = QUERY_TYPE.SN;
+ ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
+ }
}
return ut;
}
static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
throws AncestralTaxonomyInferenceException, IOException {
final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
- UniProtTaxonomy up_tax = null;
if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
- up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
+ return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
}
else {
+ final List<UniProtTaxonomy> matching_taxonomies = new ArrayList<UniProtTaxonomy>();
final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
- .get( lineage.size() - 1 ) );
+ .get( lineage.size() - 1 ) );
if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
boolean match = true;
I: for( int i = 0; i < lineage.size(); ++i ) {
- if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
+ if ( ( i == up_taxonomy.getLineage().size() )
+ || !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
match = false;
break I;
}
}
if ( match ) {
- if ( up_tax != null ) {
- //TODO this is dead code?!
- throw new AncestralTaxonomyInferenceException( "lineage \""
- + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
- }
- up_tax = up_taxonomy;
+ matching_taxonomies.add( up_taxonomy );
}
}
- if ( up_tax == null ) {
+ if ( matching_taxonomies.isEmpty() ) {
throw new AncestralTaxonomyInferenceException( "lineage \""
+ ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
}
- TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax );
- if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
- TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
+ // If more than one taxonomy matches (e.g. "Xenopus" is both a genus and a subgenus), keep the one with the shortest lineage, i.e. the least specific one:
+ int shortest = Integer.MAX_VALUE;
+ UniProtTaxonomy least_specific_up_tax = null;
+ for( final UniProtTaxonomy m : matching_taxonomies ) {
+ final int s = m.getLineage().size();
+ if ( s < shortest ) {
+ shortest = s;
+ least_specific_up_tax = m;
+ }
}
- if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
- TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+ TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) {
+ TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(),
+ least_specific_up_tax );
}
- if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
- TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) {
+ TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(),
+ least_specific_up_tax );
}
- if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
- TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) {
+ TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(),
+ least_specific_up_tax );
}
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) {
+ TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax );
+ }
+ return least_specific_up_tax;
+ }
+ else {
+ throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) )
+ + "\" not found" );
}
}
- return up_tax;
}
synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
final PhylogenyNode node,
final Taxonomy tax,
final UniProtTaxonomy up_tax )
- throws PhyloXmlDataFormatException {
+ throws PhyloXmlDataFormatException {
if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
&& ForesterUtil.isEmpty( tax.getScientificName() ) ) {
tax.setScientificName( up_tax.getScientificName() );
JOptionPane.WARNING_MESSAGE );
}
catch ( final Exception e ) {
- // Not important if this fails, do nothing.
+ // Not important if this fails, do nothing.
}
return;
}
JOptionPane.WARNING_MESSAGE );
}
catch ( final Exception e ) {
- // Not important if this fails, do nothing.
+ // Not important if this fails, do nothing.
}
}
else {