static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
throws AncestralTaxonomyInferenceException, IOException {
final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
- UniProtTaxonomy up_tax = null;
if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
- up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
+ return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
}
else {
+ final List<UniProtTaxonomy> matching_taxonomies = new ArrayList<UniProtTaxonomy>();
final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
.get( lineage.size() - 1 ) );
if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
}
}
if ( match ) {
- if ( up_tax != null ) {
- //TODO this is dead code?!
- throw new AncestralTaxonomyInferenceException( "lineage \""
- + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
- }
- up_tax = up_taxonomy;
+ matching_taxonomies.add( up_taxonomy );
}
}
- if ( up_tax == null ) {
+ if ( matching_taxonomies.isEmpty() ) {
throw new AncestralTaxonomyInferenceException( "lineage \""
+ ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
}
- TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax );
- if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
- TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
+ // If more than one taxonomy matches (e.g. "Xenopus" is both a genus and a subgenus), keep the one with the shortest, i.e. least specific, lineage:
+ int shortest = Integer.MAX_VALUE;
+ UniProtTaxonomy least_specific_up_tax = null;
+ for( final UniProtTaxonomy m : matching_taxonomies ) {
+ final int s = m.getLineage().size();
+ if ( s < shortest ) {
+ shortest = s;
+ least_specific_up_tax = m;
+ }
}
- if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
- TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+ TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) {
+ TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(),
+ least_specific_up_tax );
}
- if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
- TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) {
+ TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(),
+ least_specific_up_tax );
}
- if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
- TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) {
+ TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(),
+ least_specific_up_tax );
}
+ if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) {
+ TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax );
+ }
+ return least_specific_up_tax;
+ }
+ else {
+ throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) )
+ + "\" not found" );
}
}
- return up_tax;
}
synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
import org.forester.go.PfamToGoParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;
+import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
}
}
final String[][] input_file_properties = processInputGenomesFile( input_genomes_file );
- for( final String[] input_file_propertie : input_file_properties ) {
- for( final String element : input_file_propertie ) {
- System.out.print( element + " " );
- }
- System.out.println();
- }
+ // for( final String[] input_file_propertie : input_file_properties ) {
+ // for( final String element : input_file_propertie ) {
+ // System.out.print( element + " " );
+ // }
+ // System.out.println();
+ // }
final int number_of_genomes = input_file_properties.length;
if ( number_of_genomes < 2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" );
"genomes files is to be in the following format \"<hmmpfam output file> <species>\": "
+ e.getLocalizedMessage() );
}
+ final Set<String> specs = new HashSet<String>();
+ final Set<String> paths = new HashSet<String>();
for( int i = 0; i < input_file_properties.length; ++i ) {
+ if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: "
+ + input_file_properties[ i ][ 1 ] );
+ }
+ if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ]
+ + " is not unique" );
+ }
+ specs.add( input_file_properties[ i ][ 1 ] );
+ if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ]
+ + " is not unique" );
+ }
+ paths.add( input_file_properties[ i ][ 0 ] );
final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) );
if ( !ForesterUtil.isEmpty( error ) ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, error );
public final class ParserUtils {
- final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP";
+ final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA";
final public static Pattern TAXOMONY_SN_PATTERN = Pattern
.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" );
final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
System.out.println( results.get( 0 ).getLineage() );
return false;
}
+ //
+ results = null;
+ results = SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( "Xenopus tropicalis", 10 );
+ if ( results.size() != 1 ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+ .equals( "Xenopus tropicalis" ) ) {
+ System.out.println( results.get( 0 ).getLineage() );
+ return false;
+ }
+ //
+ results = null;
+ results = SequenceDbWsTools.getTaxonomiesFromId( "8364", 10 );
+ if ( results.size() != 1 ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+ .equals( "Xenopus tropicalis" ) ) {
+ System.out.println( results.get( 0 ).getLineage() );
+ return false;
+ }
+ //
+ results = null;
+ results = SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( "XENTR", 10 );
+ if ( results.size() != 1 ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+ return false;
+ }
+ if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+ .equals( "Xenopus tropicalis" ) ) {
+ System.out.println( results.get( 0 ).getLineage() );
+ return false;
+ }
}
catch ( final IOException e ) {
System.out.println();
public final class SequenceDbWsTools {
- private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease!
- public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
- public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
- public final static String EMBL_DBS_EMBL = "embl";
- public final static String EMBL_DBS_REFSEQ_P = "refseqp";
- public final static String EMBL_DBS_REFSEQ_N = "refseqn";
- private final static String URL_ENC = "UTF-8";
- private final static boolean DEBUG = false;
+ public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
+ public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
+ public final static String EMBL_DBS_EMBL = "embl";
+ public final static String EMBL_DBS_REFSEQ_P = "refseqp";
+ public final static String EMBL_DBS_REFSEQ_N = "refseqn";
+ private final static String URL_ENC = "UTF-8";
+ private final static boolean DEBUG = false;
- public static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
+ private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
throws IOException {
final List<String> result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return );
if ( result.size() > 0 ) {
return null;
}
- public static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String sn,
- final int max_taxonomies_return )
+ private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String sn,
+ final int max_taxonomies_return )
throws IOException {
- // Hack! Craniata? ..
- if ( sn.equals( "Drosophila" ) ) {
- return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS );
- }
- else if ( sn.equals( "Xenopus" ) ) {
- return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS );
- }
- // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) {
- // return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA );
- // }
final List<String> result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return );
if ( result.size() > 0 ) {
return parseUniProtTaxonomy( result );
final int max_taxonomies_return )
throws IOException {
final String my_code = new String( code );
- if ( ALLOW_TAXONOMY_CODE_HACKS ) {
- final List<UniProtTaxonomy> l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code );
- if ( l != null ) {
- return l;
- }
- }
final List<String> result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return );
if ( result.size() > 0 ) {
return parseUniProtTaxonomy( result );
return taxonomies;
}
- private static List<UniProtTaxonomy> resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code )
- throws IOException {
- if ( code.equals( "CAP" ) ) {
- return getTaxonomiesFromId( "283909", max_taxonomies_return );
- }
- else if ( code.equals( "FUGRU" ) ) {
- return getTaxonomiesFromId( "31033", max_taxonomies_return );
- }
- else if ( code.equals( "GIALA" ) ) {
- return getTaxonomiesFromId( "5741", max_taxonomies_return );
- }
- else if ( code.equals( "TRIVE" ) ) {
- return getTaxonomiesFromId( "413071", max_taxonomies_return );
- }
- else if ( code.equals( "CAPOWC" ) ) {
- return getTaxonomiesFromId( "192875", max_taxonomies_return );
- }
- else if ( code.equals( "SPHARC" ) ) {
- return getTaxonomiesFromId( "667725", max_taxonomies_return );
- }
- else if ( code.equals( "THETRA" ) ) {
- return getTaxonomiesFromId( "529818", max_taxonomies_return );
- }
- else if ( code.equals( "CHLVUL" ) ) {
- return getTaxonomiesFromId( "574566", max_taxonomies_return );
- }
- else if ( code.equals( "CITCLE" ) ) {
- return getTaxonomiesFromId( "85681", max_taxonomies_return );
- }
- else if ( code.equals( "MYCPOP" ) ) {
- return getTaxonomiesFromId( "85929", max_taxonomies_return );
- }
- else if ( code.equals( "AGABB" ) ) {
- return getTaxonomiesFromId( "597362", max_taxonomies_return );
- }
- else if ( code.equals( "BAUCOM" ) ) {
- return getTaxonomiesFromId( "430998", max_taxonomies_return );
- }
- else if ( code.equals( "DICSQU" ) ) {
- return getTaxonomiesFromId( "114155", max_taxonomies_return );
- }
- else if ( code.equals( "FOMPIN" ) ) {
- return getTaxonomiesFromId( "40483", max_taxonomies_return );
- }
- else if ( code.equals( "HYDMA" ) ) {
- return getTaxonomiesFromId( "6085", max_taxonomies_return );
- }
- else if ( code.equals( "MYCFI" ) ) {
- return getTaxonomiesFromId( "83344", max_taxonomies_return );
- }
- else if ( code.equals( "OIDMAI" ) ) {
- return getTaxonomiesFromId( "78148", max_taxonomies_return );
- }
- else if ( code.equals( "OSTRC" ) ) {
- return getTaxonomiesFromId( "385169", max_taxonomies_return );
- }
- else if ( code.equals( "POSPL" ) ) {
- return getTaxonomiesFromId( "104341", max_taxonomies_return );
- }
- else if ( code.equals( "SAICOM" ) ) {
- return getTaxonomiesFromId( "5606", max_taxonomies_return );
- }
- else if ( code.equals( "SERLA" ) ) {
- return getTaxonomiesFromId( "85982", max_taxonomies_return );
- }
- else if ( code.equals( "SPORO" ) ) {
- return getTaxonomiesFromId( "40563", max_taxonomies_return );
- }
- else if ( code.equals( "ACRALC" ) ) {
- return getTaxonomiesFromId( "398408", max_taxonomies_return );
- }
- else if ( code.equals( "THITER" ) ) {
- return getTaxonomiesFromId( "35720", max_taxonomies_return );
- }
- else if ( code.equals( "MYCTHE" ) ) {
- return getTaxonomiesFromId( "78579", max_taxonomies_return );
- }
- else if ( code.equals( "CONPUT" ) ) {
- return getTaxonomiesFromId( "80637", max_taxonomies_return );
- }
- else if ( code.equals( "WOLCOC" ) ) {
- return getTaxonomiesFromId( "81056", max_taxonomies_return );
- }
- else if ( code.equals( "CLAGRA" ) ) {
- return getTaxonomiesFromId( "27339", max_taxonomies_return );
- }
- else if ( code.equals( "XANPAR" ) ) {
- return getTaxonomiesFromId( "107463", max_taxonomies_return );
- }
- else if ( code.equals( "HYDPIN" ) ) {
- return getTaxonomiesFromId( "388859", max_taxonomies_return );
- }
- else if ( code.equals( "SERLAC" ) ) {
- return getTaxonomiesFromId( "85982", max_taxonomies_return );
- }
- else {
- return null;
- }
- }
-
- private static List<UniProtTaxonomy> uniProtTaxonomyToList( final UniProtTaxonomy tax ) {
- final List<UniProtTaxonomy> l = new ArrayList<UniProtTaxonomy>();
- l.add( tax );
- return l;
- }
-
/**
 * Database identifiers: {@code UNIPROT}, {@code EMBL}, {@code NCBI},
 * {@code NONE} and {@code REFSEQ}.
 * <p>
 * NOTE(review): {@code NONE} presumably marks "no database could be
 * determined" -- confirm against the callers. The declaration order is kept
 * as-is so that ordinals remain unchanged.
 */
public enum Db {
    UNIPROT,
    EMBL,
    NCBI,
    NONE,
    REFSEQ
}
public final class UniProtTaxonomy {
- private static final String ARCHAEA = "Archaea";
- private static final String BACTERIA = "Bacteria";
- private static final String EUKARYOTA = "Eukaryota";
- private final List<String> _lineage;
- private final String _code;
- private final String _scientific_name;
- private final String _common_name;
- private final String _synonym;
- private final String _rank;
- private final String _id;
- public final static String CELLULAR_ORGANISMS = "cellular organisms";
- public final static String VIRUSES = "Viruses";
- public final static UniProtTaxonomy DROSOPHILA_GENUS = new UniProtTaxonomy( new String[] {
- CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Ecdysozoa", "Arthropoda", "Hexapoda", "Insecta", "Pterygota",
- "Neoptera", "Endopterygota", "Diptera", "Brachycera", "Muscomorpha", "Ephydroidea", "Drosophilidae",
- "Drosophila" },
- "",
- "fruit flies",
- "Drosophila",
- "",
- "genus",
- "7215" );
- public final static UniProtTaxonomy XENOPUS_GENUS = new UniProtTaxonomy( new String[] {
- CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Amphibia",
- "Batrachia", "Anura", "Mesobatrachia", "Pipoidea", "Pipidae", "Xenopodinae", "Xenopus" },
- "",
- "",
- "Xenopus",
- "",
- "genus",
- "8353" );
- public final static UniProtTaxonomy CAPITELLA_TELATA_SPECIES = new UniProtTaxonomy( new String[] {
- CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Annelida", "Polychaeta", "Scolecida", "Capitellida",
- "Capitellidae", "Capitella", "Capitella teleta" },
- "",
- "",
- "Capitella teleta",
- "Capitella sp. I",
- "species",
- "283909" );
+ private static final String ARCHAEA = "Archaea";
+ private static final String BACTERIA = "Bacteria";
+ private static final String EUKARYOTA = "Eukaryota";
+ private final List<String> _lineage;
+ private final String _code;
+ private final String _scientific_name;
+ private final String _common_name;
+ private final String _synonym;
+ private final String _rank;
+ private final String _id;
+ public final static String CELLULAR_ORGANISMS = "cellular organisms";
+ public final static String VIRUSES = "Viruses";
public UniProtTaxonomy( final String line ) {
final String[] items = line.split( "\t" );