inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 28 Jun 2013 03:46:04 +0000 (03:46 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 28 Jun 2013 03:46:04 +0000 (03:46 +0000)
forester/java/src/org/forester/analysis/TaxonomyDataManager.java
forester/java/src/org/forester/application/surfacing.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java

index cddaa4f..1a1bef6 100644 (file)
@@ -326,11 +326,11 @@ public final class TaxonomyDataManager extends RunnableProcess {
     static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
             throws AncestralTaxonomyInferenceException, IOException {
         final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
-        UniProtTaxonomy up_tax = null;
         if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
-            up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
+            return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
         }
         else {
+            final List<UniProtTaxonomy> matching_taxonomies = new ArrayList<UniProtTaxonomy>();
             final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
                     .get( lineage.size() - 1 ) );
             if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
@@ -344,34 +344,46 @@ public final class TaxonomyDataManager extends RunnableProcess {
                         }
                     }
                     if ( match ) {
-                        if ( up_tax != null ) {
-                            //TODO this is dead code?!
-                            throw new AncestralTaxonomyInferenceException( "lineage \""
-                                    + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
-                        }
-                        up_tax = up_taxonomy;
+                        matching_taxonomies.add( up_taxonomy );
                     }
                 }
-                if ( up_tax == null ) {
+                if ( matching_taxonomies.isEmpty() ) {
                     throw new AncestralTaxonomyInferenceException( "lineage \""
                             + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
                 }
-                TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax );
-                if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
-                    TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
+                // If more than one taxonomy matches (e.g. "Xenopus" genus and subgenus), keep the one with the shortest, i.e. least specific, lineage:
+                int shortest = Integer.MAX_VALUE;
+                UniProtTaxonomy least_specific_up_tax = null;
+                for( final UniProtTaxonomy m : matching_taxonomies ) {
+                    final int s = m.getLineage().size();
+                    if ( s < shortest ) {
+                        shortest = s;
+                        least_specific_up_tax = m;
+                    }
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
-                    TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+                TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) {
+                    TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(),
+                                                                least_specific_up_tax );
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
-                    TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) {
+                    TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(),
+                                                                  least_specific_up_tax );
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
-                    TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) {
+                    TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(),
+                                                                least_specific_up_tax );
                 }
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) {
+                    TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax );
+                }
+                return least_specific_up_tax;
+            }
+            else {
+                throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) )
+                        + "\" not found" );
             }
         }
-        return up_tax;
     }
 
     synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
index f15ea36..f2f0c9e 100644 (file)
@@ -54,6 +54,7 @@ import org.forester.go.PfamToGoMapping;
 import org.forester.go.PfamToGoParser;
 import org.forester.io.parsers.HmmscanPerDomainTableParser;
 import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;
+import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
@@ -1019,12 +1020,12 @@ public class surfacing {
             }
         }
         final String[][] input_file_properties = processInputGenomesFile( input_genomes_file );
-        for( final String[] input_file_propertie : input_file_properties ) {
-            for( final String element : input_file_propertie ) {
-                System.out.print( element + " " );
-            }
-            System.out.println();
-        }
+        //        for( final String[] input_file_propertie : input_file_properties ) {
+        //            for( final String element : input_file_propertie ) {
+        //                System.out.print( element + " " );
+        //            }
+        //            System.out.println();
+        //        }
         final int number_of_genomes = input_file_properties.length;
         if ( number_of_genomes < 2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" );
@@ -2627,7 +2628,23 @@ public class surfacing {
                                      "genomes files is to be in the following format \"<hmmpfam output file> <species>\": "
                                              + e.getLocalizedMessage() );
         }
+        final Set<String> specs = new HashSet<String>();
+        final Set<String> paths = new HashSet<String>();
         for( int i = 0; i < input_file_properties.length; ++i ) {
+            if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) {
+                ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: "
+                        + input_file_properties[ i ][ 1 ] );
+            }
+            if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) {
+                ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ]
+                        + " is not unique" );
+            }
+            specs.add( input_file_properties[ i ][ 1 ] );
+            if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) {
+                ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ]
+                        + " is not unique" );
+            }
+            paths.add( input_file_properties[ i ][ 0 ] );
             final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) );
             if ( !ForesterUtil.isEmpty( error ) ) {
                 ForesterUtil.fatalError( surfacing.PRG_NAME, error );
index 26e35b3..faa0918 100644 (file)
@@ -55,7 +55,7 @@ import org.forester.util.ForesterUtil;
 
 public final class ParserUtils {
 
-    final public static String   TAX_CODE                        = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP";
+    final public static String   TAX_CODE                        = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA";
     final public static Pattern  TAXOMONY_SN_PATTERN             = Pattern
                                                                          .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PFS       = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
index e8c173e..8f9341d 100644 (file)
@@ -11160,6 +11160,84 @@ public final class Test {
                 System.out.println( results.get( 0 ).getLineage() );
                 return false;
             }
+            //
+            results = null;
+            results = SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( "Xenopus tropicalis", 10 );
+            if ( results.size() != 1 ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+                    .equals( "Xenopus tropicalis" ) ) {
+                System.out.println( results.get( 0 ).getLineage() );
+                return false;
+            }
+            //
+            results = null;
+            results = SequenceDbWsTools.getTaxonomiesFromId( "8364", 10 );
+            if ( results.size() != 1 ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+                    .equals( "Xenopus tropicalis" ) ) {
+                System.out.println( results.get( 0 ).getLineage() );
+                return false;
+            }
+            //
+            results = null;
+            results = SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( "XENTR", 10 );
+            if ( results.size() != 1 ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCode().equals( "XENTR" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "Western clawed frog" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getId().equalsIgnoreCase( "8364" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getScientificName().equals( "Xenopus tropicalis" ) ) {
+                return false;
+            }
+            if ( !results.get( 0 ).getLineage().get( results.get( 0 ).getLineage().size() - 1 )
+                    .equals( "Xenopus tropicalis" ) ) {
+                System.out.println( results.get( 0 ).getLineage() );
+                return false;
+            }
         }
         catch ( final IOException e ) {
             System.out.println();
index 7c656d5..1bb2e98 100644 (file)
@@ -49,16 +49,15 @@ import org.forester.util.SequenceIdParser;
 
 public final class SequenceDbWsTools {
 
-    private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true;                                         //TODO turn off for final realease!
-    public final static String   BASE_UNIPROT_URL          = "http://www.uniprot.org/";
-    public final static String   BASE_EMBL_DB_URL          = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
-    public final static String   EMBL_DBS_EMBL             = "embl";
-    public final static String   EMBL_DBS_REFSEQ_P         = "refseqp";
-    public final static String   EMBL_DBS_REFSEQ_N         = "refseqn";
-    private final static String  URL_ENC                   = "UTF-8";
-    private final static boolean DEBUG                     = false;
+    public final static String   BASE_UNIPROT_URL  = "http://www.uniprot.org/";
+    public final static String   BASE_EMBL_DB_URL  = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
+    public final static String   EMBL_DBS_EMBL     = "embl";
+    public final static String   EMBL_DBS_REFSEQ_P = "refseqp";
+    public final static String   EMBL_DBS_REFSEQ_N = "refseqn";
+    private final static String  URL_ENC           = "UTF-8";
+    private final static boolean DEBUG             = false;
 
-    public static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
+    private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
             throws IOException {
         final List<String> result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return );
         if ( result.size() > 0 ) {
@@ -92,19 +91,9 @@ public final class SequenceDbWsTools {
         return null;
     }
 
-    public static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String sn,
-                                                                         final int max_taxonomies_return )
+    private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String sn,
+                                                                          final int max_taxonomies_return )
             throws IOException {
-        // Hack!  Craniata? .. 
-        if ( sn.equals( "Drosophila" ) ) {
-            return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS );
-        }
-        else if ( sn.equals( "Xenopus" ) ) {
-            return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS );
-        }
-        // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) {
-        //     return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA );
-        // }
         final List<String> result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return );
         if ( result.size() > 0 ) {
             return parseUniProtTaxonomy( result );
@@ -138,12 +127,6 @@ public final class SequenceDbWsTools {
                                                                        final int max_taxonomies_return )
             throws IOException {
         final String my_code = new String( code );
-        if ( ALLOW_TAXONOMY_CODE_HACKS ) {
-            final List<UniProtTaxonomy> l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code );
-            if ( l != null ) {
-                return l;
-            }
-        }
         final List<String> result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return );
         if ( result.size() > 0 ) {
             return parseUniProtTaxonomy( result );
@@ -394,112 +377,6 @@ public final class SequenceDbWsTools {
         return taxonomies;
     }
 
-    private static List<UniProtTaxonomy> resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code )
-            throws IOException {
-        if ( code.equals( "CAP" ) ) {
-            return getTaxonomiesFromId( "283909", max_taxonomies_return );
-        }
-        else if ( code.equals( "FUGRU" ) ) {
-            return getTaxonomiesFromId( "31033", max_taxonomies_return );
-        }
-        else if ( code.equals( "GIALA" ) ) {
-            return getTaxonomiesFromId( "5741", max_taxonomies_return );
-        }
-        else if ( code.equals( "TRIVE" ) ) {
-            return getTaxonomiesFromId( "413071", max_taxonomies_return );
-        }
-        else if ( code.equals( "CAPOWC" ) ) {
-            return getTaxonomiesFromId( "192875", max_taxonomies_return );
-        }
-        else if ( code.equals( "SPHARC" ) ) {
-            return getTaxonomiesFromId( "667725", max_taxonomies_return );
-        }
-        else if ( code.equals( "THETRA" ) ) {
-            return getTaxonomiesFromId( "529818", max_taxonomies_return );
-        }
-        else if ( code.equals( "CHLVUL" ) ) {
-            return getTaxonomiesFromId( "574566", max_taxonomies_return );
-        }
-        else if ( code.equals( "CITCLE" ) ) {
-            return getTaxonomiesFromId( "85681", max_taxonomies_return );
-        }
-        else if ( code.equals( "MYCPOP" ) ) {
-            return getTaxonomiesFromId( "85929", max_taxonomies_return );
-        }
-        else if ( code.equals( "AGABB" ) ) {
-            return getTaxonomiesFromId( "597362", max_taxonomies_return );
-        }
-        else if ( code.equals( "BAUCOM" ) ) {
-            return getTaxonomiesFromId( "430998", max_taxonomies_return );
-        }
-        else if ( code.equals( "DICSQU" ) ) {
-            return getTaxonomiesFromId( "114155", max_taxonomies_return );
-        }
-        else if ( code.equals( "FOMPIN" ) ) {
-            return getTaxonomiesFromId( "40483", max_taxonomies_return );
-        }
-        else if ( code.equals( "HYDMA" ) ) {
-            return getTaxonomiesFromId( "6085", max_taxonomies_return );
-        }
-        else if ( code.equals( "MYCFI" ) ) {
-            return getTaxonomiesFromId( "83344", max_taxonomies_return );
-        }
-        else if ( code.equals( "OIDMAI" ) ) {
-            return getTaxonomiesFromId( "78148", max_taxonomies_return );
-        }
-        else if ( code.equals( "OSTRC" ) ) {
-            return getTaxonomiesFromId( "385169", max_taxonomies_return );
-        }
-        else if ( code.equals( "POSPL" ) ) {
-            return getTaxonomiesFromId( "104341", max_taxonomies_return );
-        }
-        else if ( code.equals( "SAICOM" ) ) {
-            return getTaxonomiesFromId( "5606", max_taxonomies_return );
-        }
-        else if ( code.equals( "SERLA" ) ) {
-            return getTaxonomiesFromId( "85982", max_taxonomies_return );
-        }
-        else if ( code.equals( "SPORO" ) ) {
-            return getTaxonomiesFromId( "40563", max_taxonomies_return );
-        }
-        else if ( code.equals( "ACRALC" ) ) {
-            return getTaxonomiesFromId( "398408", max_taxonomies_return );
-        }
-        else if ( code.equals( "THITER" ) ) {
-            return getTaxonomiesFromId( "35720", max_taxonomies_return );
-        }
-        else if ( code.equals( "MYCTHE" ) ) {
-            return getTaxonomiesFromId( "78579", max_taxonomies_return );
-        }
-        else if ( code.equals( "CONPUT" ) ) {
-            return getTaxonomiesFromId( "80637", max_taxonomies_return );
-        }
-        else if ( code.equals( "WOLCOC" ) ) {
-            return getTaxonomiesFromId( "81056", max_taxonomies_return );
-        }
-        else if ( code.equals( "CLAGRA" ) ) {
-            return getTaxonomiesFromId( "27339", max_taxonomies_return );
-        }
-        else if ( code.equals( "XANPAR" ) ) {
-            return getTaxonomiesFromId( "107463", max_taxonomies_return );
-        }
-        else if ( code.equals( "HYDPIN" ) ) {
-            return getTaxonomiesFromId( "388859", max_taxonomies_return );
-        }
-        else if ( code.equals( "SERLAC" ) ) {
-            return getTaxonomiesFromId( "85982", max_taxonomies_return );
-        }
-        else {
-            return null;
-        }
-    }
-
-    private static List<UniProtTaxonomy> uniProtTaxonomyToList( final UniProtTaxonomy tax ) {
-        final List<UniProtTaxonomy> l = new ArrayList<UniProtTaxonomy>();
-        l.add( tax );
-        return l;
-    }
-
     public enum Db {
         UNIPROT, EMBL, NCBI, NONE, REFSEQ;
     }
index b50d41e..2cdd93e 100644 (file)
@@ -32,46 +32,18 @@ import org.forester.util.ForesterUtil;
 
 public final class UniProtTaxonomy {
 
-    private static final String         ARCHAEA                  = "Archaea";
-    private static final String         BACTERIA                 = "Bacteria";
-    private static final String         EUKARYOTA                = "Eukaryota";
-    private final List<String>          _lineage;
-    private final String                _code;
-    private final String                _scientific_name;
-    private final String                _common_name;
-    private final String                _synonym;
-    private final String                _rank;
-    private final String                _id;
-    public final static String          CELLULAR_ORGANISMS       = "cellular organisms";
-    public final static String          VIRUSES                  = "Viruses";
-    public final static UniProtTaxonomy DROSOPHILA_GENUS         = new UniProtTaxonomy( new String[] {
-            CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Ecdysozoa", "Arthropoda", "Hexapoda", "Insecta", "Pterygota",
-            "Neoptera", "Endopterygota", "Diptera", "Brachycera", "Muscomorpha", "Ephydroidea", "Drosophilidae",
-            "Drosophila"                                                               },
-                                                                                        "",
-                                                                                        "fruit flies",
-                                                                                        "Drosophila",
-                                                                                        "",
-                                                                                        "genus",
-                                                                                        "7215" );
-    public final static UniProtTaxonomy XENOPUS_GENUS            = new UniProtTaxonomy( new String[] {
-            CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Amphibia",
-            "Batrachia", "Anura", "Mesobatrachia", "Pipoidea", "Pipidae", "Xenopodinae", "Xenopus" },
-                                                                                        "",
-                                                                                        "",
-                                                                                        "Xenopus",
-                                                                                        "",
-                                                                                        "genus",
-                                                                                        "8353" );
-    public final static UniProtTaxonomy CAPITELLA_TELATA_SPECIES = new UniProtTaxonomy( new String[] {
-            CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Annelida", "Polychaeta", "Scolecida", "Capitellida",
-            "Capitellidae", "Capitella", "Capitella teleta"                            },
-                                                                                        "",
-                                                                                        "",
-                                                                                        "Capitella teleta",
-                                                                                        "Capitella sp. I",
-                                                                                        "species",
-                                                                                        "283909" );
+    private static final String ARCHAEA            = "Archaea";
+    private static final String BACTERIA           = "Bacteria";
+    private static final String EUKARYOTA          = "Eukaryota";
+    private final List<String>  _lineage;
+    private final String        _code;
+    private final String        _scientific_name;
+    private final String        _common_name;
+    private final String        _synonym;
+    private final String        _rank;
+    private final String        _id;
+    public final static String  CELLULAR_ORGANISMS = "cellular organisms";
+    public final static String  VIRUSES            = "Viruses";
 
     public UniProtTaxonomy( final String line ) {
         final String[] items = line.split( "\t" );