From cbc5c71b164a57b8ad6c988d015057c7f0972478 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 28 Jun 2013 03:46:04 +0000 Subject: [PATCH] inprogress --- .../org/forester/analysis/TaxonomyDataManager.java | 50 ++++--- .../src/org/forester/application/surfacing.java | 29 +++- .../org/forester/io/parsers/util/ParserUtils.java | 2 +- forester/java/src/org/forester/test/Test.java | 78 +++++++++++ .../org/forester/ws/seqdb/SequenceDbWsTools.java | 143 ++------------------ .../src/org/forester/ws/seqdb/UniProtTaxonomy.java | 52 ++----- 6 files changed, 155 insertions(+), 199 deletions(-) diff --git a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java index cddaa4f..1a1bef6 100644 --- a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java +++ b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java @@ -326,11 +326,11 @@ public final class TaxonomyDataManager extends RunnableProcess { static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List lineage ) throws AncestralTaxonomyInferenceException, IOException { final String lineage_str = ForesterUtil.stringListToString( lineage, ">" ); - UniProtTaxonomy up_tax = null; if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) { - up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy(); + return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy(); } else { + final List matching_taxonomies = new ArrayList(); final List up_taxonomies = getTaxonomiesFromScientificName( lineage .get( lineage.size() - 1 ) ); if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) { @@ -344,34 +344,46 @@ public final class TaxonomyDataManager extends RunnableProcess { } } if ( match ) { - if ( up_tax != null ) { - //TODO this is dead code?! - throw new AncestralTaxonomyInferenceException( "lineage \"" - + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" ); - } - up_tax = up_taxonomy; + matching_taxonomies.add( up_taxonomy ); } } - if ( up_tax == null ) { + if ( matching_taxonomies.isEmpty() ) { throw new AncestralTaxonomyInferenceException( "lineage \"" + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" ); } - TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax ); - if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) { - TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax ); + //in case of more than one (e.g. "Xenopus" Genus and Subgenus), keep shorter, less specific one: + int shortest = Integer.MAX_VALUE; + UniProtTaxonomy least_specific_up_tax = null; + for( final UniProtTaxonomy m : matching_taxonomies ) { + final int s = m.getLineage().size(); + if ( s < shortest ) { + shortest = s; + least_specific_up_tax = m; + } } - if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { - TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); + TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) { + TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(), + least_specific_up_tax ); } - if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { - TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) { + TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(), + least_specific_up_tax ); } - if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { - TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) { + TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(), + least_specific_up_tax ); } + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) { + TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax ); + } + return least_specific_up_tax; + } + else { + throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) ) + + "\" not found" ); } } - return up_tax; } synchronized final private static void updateTaxonomy( final QUERY_TYPE qt, diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index f15ea36..f2f0c9e 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -54,6 +54,7 @@ import org.forester.go.PfamToGoMapping; import org.forester.go.PfamToGoParser; import org.forester.io.parsers.HmmscanPerDomainTableParser; import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; @@ -1019,12 +1020,12 @@ public class surfacing { } } final String[][] input_file_properties = processInputGenomesFile( input_genomes_file ); - for( final String[] input_file_propertie : input_file_properties ) { - for( final String element : input_file_propertie ) { - System.out.print( element + " " ); - } - System.out.println(); - } + // for( final String[] input_file_propertie : input_file_properties ) { + // for( final String element : input_file_propertie ) { + // System.out.print( element + " " ); + // } + // System.out.println(); + // } final int number_of_genomes = input_file_properties.length; if ( number_of_genomes < 2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); @@ -2627,7 +2628,23 @@ public class surfacing { "genomes files is to be in the following format \" \": " + e.getLocalizedMessage() ); } + final Set specs = new HashSet(); + final Set paths = new HashSet(); for( int i = 0; i < input_file_properties.length; ++i ) { + if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: " + + input_file_properties[ i ][ 1 ] ); + } + if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ] + + " is not unique" ); + } + specs.add( input_file_properties[ i ][ 1 ] ); + if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ] + + " is not unique" ); + } + paths.add( input_file_properties[ i ][ 0 ] ); final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); if ( !ForesterUtil.isEmpty( error ) ) { ForesterUtil.fatalError( surfacing.PRG_NAME, error ); diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 26e35b3..faa0918 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,7 +55,7 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP"; + final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA"; final public static Pattern TAXOMONY_SN_PATTERN = Pattern .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index e8c173e..8f9341d 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -11160,6 +11160,84 @@ public final class Test { System.out.println( results.get( 0 ).getLineage() ); return false; } + // + results = null; + results = SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( "Xenopus tropicalis", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 ) + .equals( "Xenopus tropicalis" ) ) { + System.out.println( results.get( 0 ).getLineage() ); + return false; + } + // + results = null; + results = SequenceDbWsTools.getTaxonomiesFromId( "8364", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 ) + .equals( "Xenopus tropicalis" ) ) { + System.out.println( results.get( 0 ).getLineage() ); + return false; + } + // + results = null; + results = SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( "XENTR", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 ) + .equals( "Xenopus tropicalis" ) ) { + System.out.println( results.get( 0 ).getLineage() ); + return false; + } } catch ( final IOException e ) { System.out.println(); diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index 7c656d5..1bb2e98 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -49,16 +49,15 @@ import org.forester.util.SequenceIdParser; public final class SequenceDbWsTools { - private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; - public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - private final static String URL_ENC = "UTF-8"; - private final static boolean DEBUG = false; + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; + public final static String EMBL_DBS_EMBL = "embl"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + private final static String URL_ENC = "UTF-8"; + private final static boolean DEBUG = false; - public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) + private static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); if ( result.size() > 0 ) { @@ -92,19 +91,9 @@ public final class SequenceDbWsTools { return null; } - public static List getTaxonomiesFromScientificName( final String sn, - final int max_taxonomies_return ) + private static List getTaxonomiesFromScientificName( final String sn, + final int max_taxonomies_return ) throws IOException { - // Hack! Craniata? .. - if ( sn.equals( "Drosophila" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS ); - } - else if ( sn.equals( "Xenopus" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS ); - } - // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) { - // return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA ); - // } final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); @@ -138,12 +127,6 @@ public final class SequenceDbWsTools { final int max_taxonomies_return ) throws IOException { final String my_code = new String( code ); - if ( ALLOW_TAXONOMY_CODE_HACKS ) { - final List l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code ); - if ( l != null ) { - return l; - } - } final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); @@ -394,112 +377,6 @@ public final class SequenceDbWsTools { return taxonomies; } - private static List resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code ) - throws IOException { - if ( code.equals( "CAP" ) ) { - return getTaxonomiesFromId( "283909", max_taxonomies_return ); - } - else if ( code.equals( "FUGRU" ) ) { - return getTaxonomiesFromId( "31033", max_taxonomies_return ); - } - else if ( code.equals( "GIALA" ) ) { - return getTaxonomiesFromId( "5741", max_taxonomies_return ); - } - else if ( code.equals( "TRIVE" ) ) { - return getTaxonomiesFromId( "413071", max_taxonomies_return ); - } - else if ( code.equals( "CAPOWC" ) ) { - return getTaxonomiesFromId( "192875", max_taxonomies_return ); - } - else if ( code.equals( "SPHARC" ) ) { - return getTaxonomiesFromId( "667725", max_taxonomies_return ); - } - else if ( code.equals( "THETRA" ) ) { - return getTaxonomiesFromId( "529818", max_taxonomies_return ); - } - else if ( code.equals( "CHLVUL" ) ) { - return getTaxonomiesFromId( "574566", max_taxonomies_return ); - } - else if ( code.equals( "CITCLE" ) ) { - return getTaxonomiesFromId( "85681", max_taxonomies_return ); - } - else if ( code.equals( "MYCPOP" ) ) { - return getTaxonomiesFromId( "85929", max_taxonomies_return ); - } - else if ( code.equals( "AGABB" ) ) { - return getTaxonomiesFromId( "597362", max_taxonomies_return ); - } - else if ( code.equals( "BAUCOM" ) ) { - return getTaxonomiesFromId( "430998", max_taxonomies_return ); - } - else if ( code.equals( "DICSQU" ) ) { - return getTaxonomiesFromId( "114155", max_taxonomies_return ); - } - else if ( code.equals( "FOMPIN" ) ) { - return getTaxonomiesFromId( "40483", max_taxonomies_return ); - } - else if ( code.equals( "HYDMA" ) ) { - return getTaxonomiesFromId( "6085", max_taxonomies_return ); - } - else if ( code.equals( "MYCFI" ) ) { - return getTaxonomiesFromId( "83344", max_taxonomies_return ); - } - else if ( code.equals( "OIDMAI" ) ) { - return getTaxonomiesFromId( "78148", max_taxonomies_return ); - } - else if ( code.equals( "OSTRC" ) ) { - return getTaxonomiesFromId( "385169", max_taxonomies_return ); - } - else if ( code.equals( "POSPL" ) ) { - return getTaxonomiesFromId( "104341", max_taxonomies_return ); - } - else if ( code.equals( "SAICOM" ) ) { - return getTaxonomiesFromId( "5606", max_taxonomies_return ); - } - else if ( code.equals( "SERLA" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); - } - else if ( code.equals( "SPORO" ) ) { - return getTaxonomiesFromId( "40563", max_taxonomies_return ); - } - else if ( code.equals( "ACRALC" ) ) { - return getTaxonomiesFromId( "398408", max_taxonomies_return ); - } - else if ( code.equals( "THITER" ) ) { - return getTaxonomiesFromId( "35720", max_taxonomies_return ); - } - else if ( code.equals( "MYCTHE" ) ) { - return getTaxonomiesFromId( "78579", max_taxonomies_return ); - } - else if ( code.equals( "CONPUT" ) ) { - return getTaxonomiesFromId( "80637", max_taxonomies_return ); - } - else if ( code.equals( "WOLCOC" ) ) { - return getTaxonomiesFromId( "81056", max_taxonomies_return ); - } - else if ( code.equals( "CLAGRA" ) ) { - return getTaxonomiesFromId( "27339", max_taxonomies_return ); - } - else if ( code.equals( "XANPAR" ) ) { - return getTaxonomiesFromId( "107463", max_taxonomies_return ); - } - else if ( code.equals( "HYDPIN" ) ) { - return getTaxonomiesFromId( "388859", max_taxonomies_return ); - } - else if ( code.equals( "SERLAC" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); - } - else { - return null; - } - } - - private static List uniProtTaxonomyToList( final UniProtTaxonomy tax ) { - final List l = new ArrayList(); - l.add( tax ); - return l; - } - public enum Db { UNIPROT, EMBL, NCBI, NONE, REFSEQ; } diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java b/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java index b50d41e..2cdd93e 100644 --- a/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java @@ -32,46 +32,18 @@ import org.forester.util.ForesterUtil; public final class UniProtTaxonomy { - private static final String ARCHAEA = "Archaea"; - private static final String BACTERIA = "Bacteria"; - private static final String EUKARYOTA = "Eukaryota"; - private final List _lineage; - private final String _code; - private final String _scientific_name; - private final String _common_name; - private final String _synonym; - private final String _rank; - private final String _id; - public final static String CELLULAR_ORGANISMS = "cellular organisms"; - public final static String VIRUSES = "Viruses"; - public final static UniProtTaxonomy DROSOPHILA_GENUS = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Ecdysozoa", "Arthropoda", "Hexapoda", "Insecta", "Pterygota", - "Neoptera", "Endopterygota", "Diptera", "Brachycera", "Muscomorpha", "Ephydroidea", "Drosophilidae", - "Drosophila" }, - "", - "fruit flies", - "Drosophila", - "", - "genus", - "7215" ); - public final static UniProtTaxonomy XENOPUS_GENUS = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Amphibia", - "Batrachia", "Anura", "Mesobatrachia", "Pipoidea", "Pipidae", "Xenopodinae", "Xenopus" }, - "", - "", - "Xenopus", - "", - "genus", - "8353" ); - public final static UniProtTaxonomy CAPITELLA_TELATA_SPECIES = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Annelida", "Polychaeta", "Scolecida", "Capitellida", - "Capitellidae", "Capitella", "Capitella teleta" }, - "", - "", - "Capitella teleta", - "Capitella sp. I", - "species", - "283909" ); + private static final String ARCHAEA = "Archaea"; + private static final String BACTERIA = "Bacteria"; + private static final String EUKARYOTA = "Eukaryota"; + private final List _lineage; + private final String _code; + private final String _scientific_name; + private final String _common_name; + private final String _synonym; + private final String _rank; + private final String _id; + public final static String CELLULAR_ORGANISMS = "cellular organisms"; + public final static String VIRUSES = "Viruses"; public UniProtTaxonomy( final String line ) { final String[] items = line.split( "\t" ); -- 1.7.10.2