in progress
authorcmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sun, 22 Apr 2012 05:09:17 +0000 (05:09 +0000)
committercmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sun, 22 Apr 2012 05:09:17 +0000 (05:09 +0000)
forester/java/src/org/forester/application/surfacing.java
forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java
forester/java/src/org/forester/surfacing/SurfacingUtil.java

index 951ce21..62c6981 100644 (file)
@@ -279,6 +279,12 @@ public class surfacing {
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX        = "_indep_dc_gains_fitch_lists_for_go_mapping.txt";
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt";
     public static final String                                LIMIT_SPEC_FOR_PROT_EX                                                 = null;                                                                                                                                                                                       // e.g. "HUMAN"; set to null for not using this feature (default).
+    public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED  = "_dc_MAPPED_secondary_features_fitch"
+        + ForesterConstants.PHYLO_XML_SUFFIX;
+    public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX =  "_indep_dc_gains_fitch_counts_MAPPED.txt";
+    public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX =  "_indep_dc_gains_fitch_lists_MAPPED.txt";
+    public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
+    public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
 
     private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
                                                                  final String[][] input_file_properties,
index 2396b90..18e0879 100644 (file)
 
 package org.forester.surfacing;
 
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
@@ -208,10 +211,54 @@ public final class DomainParsimonyCalculator {
         setTotalLosses( fitch.getTotalLosses() );
         setTotalUnchanged( fitch.getTotalUnchanged() );
     }
+    
+    private void executeFitchParsimonyOnSecondaryFeatures( 
+                                        final boolean use_last,
+                                        final boolean randomize,
+                                        final long random_number_seed ) {
+        reset();
+        if ( use_last ) {
+            System.out.println( "   Fitch parsimony: use_last = true" );
+        }
+        final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
+        fitch.setRandomize( randomize );
+        if ( randomize ) {
+            fitch.setRandomNumberSeed( random_number_seed );
+        }
+        fitch.setUseLast( use_last );
+        fitch.setReturnGainLossMatrix( true );
+        fitch.setReturnInternalStates( true );
+        
+        final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
+        final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
+        final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
+        while (it.hasNext()) {
+            final Map.Entry<DomainId, Set<String>> pair =  (Map.Entry<DomainId, Set<String>>)it.next();
+            if ( pair.getValue().size() != 1 ) {
+                throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() + " items" );
+            }
+            newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
+        }
+        
+        CharacterStateMatrix<BinaryStates> states  =createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
+                                                                                newmap );
+       
+        fitch.execute( getPhylogeny(), states );
+        setGainLossMatrix( fitch.getGainLossMatrix() );
+        setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
+        setCost( fitch.getCost() );
+        setTotalGains( fitch.getTotalGains() );
+        setTotalLosses( fitch.getTotalLosses() );
+        setTotalUnchanged( fitch.getTotalUnchanged() );
+    }
 
     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
         executeFitchParsimony( false, use_last, false, 0 );
     }
+    
+    public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
+        executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
+    }
 
     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
         executeFitchParsimony( false, false, true, random_number_seed );
@@ -498,6 +545,155 @@ public final class DomainParsimonyCalculator {
         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
     }
 
+    
+    /**
+     * For folds instead of Pfam-domains, for example
+     * 
+     * 
+     * @param gwcd_list
+     * @return
+     */
+    static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
+                                                                                               final Map<DomainId, Set<String>> domain_id_to_second_features_map,
+                                                                                               final Map<Species, MappingResults> mapping_results_map ) {
+        if ( gwcd_list.isEmpty() ) {
+            throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
+        }
+        if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
+            throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
+        }
+        final int number_of_identifiers = gwcd_list.size();
+        final SortedSet<String> all_secondary_features = new TreeSet<String>();
+        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
+            int mapped = 0;
+            int not_mapped = 0;
+            for( final DomainId domain : gwcd.getAllDomainIds() ) {
+                if ( domain_id_to_second_features_map.containsKey( domain ) ) {
+                    all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
+                    mapped++;
+                }
+                else {
+                    not_mapped++;
+                }
+            }
+            if ( mapping_results_map != null ) {
+                final MappingResults mr = new MappingResults();
+                mr.setDescription( gwcd.getSpecies().getSpeciesId() );
+                mr.setSumOfSuccesses( mapped );
+                mr.setSumOfFailures( not_mapped );
+                mapping_results_map.put( gwcd.getSpecies(), mr );
+            }
+        }
+        final int number_of_characters = all_secondary_features.size();
+        final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
+                                                                                                                                                 number_of_characters );
+        int character_index = 0;
+        for( final String second_id : all_secondary_features ) {
+            matrix.setCharacter( character_index++, second_id );
+        }
+        int identifier_index = 0;
+        final Set<String> all_identifiers = new HashSet<String>();
+        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
+            final String species_id = gwcd.getSpecies().getSpeciesId();
+            if ( all_identifiers.contains( species_id ) ) {
+                throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
+            }
+            all_identifiers.add( species_id );
+            matrix.setIdentifier( identifier_index, species_id );
+            final Set<String> all_second_per_gwcd = new HashSet<String>();
+            for( final DomainId domain : gwcd.getAllDomainIds() ) {
+                if ( domain_id_to_second_features_map.containsKey( domain ) ) {
+                    all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
+                }
+            }
+            for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
+                if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
+                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
+                }
+                else {
+                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
+                }
+            }
+            ++identifier_index;
+        }
+        return matrix;
+    }
+    
+    
+    public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
+                                                                                                                             final Map<DomainId, String> domain_id_to_second_features_map) {
+        if ( gwcd_list.isEmpty() ) {
+            throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
+        }
+        if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
+            throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
+        }
+        final int number_of_identifiers = gwcd_list.size();
+        final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
+        final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
+        int identifier_index = 0;
+        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
+            binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
+            for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
+                
+                if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
+                    throw new IllegalArgumentException( "no mapping found for " + bc.getId0() );
+                }
+                if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
+                    throw new IllegalArgumentException( "no mapping found for " + bc.getId1() );
+                }
+                
+                
+                final BinaryDomainCombination mapped_bc = new  BasicBinaryDomainCombination( domain_id_to_second_features_map.get(  bc.getId0()) ,
+                                                                                             domain_id_to_second_features_map.get( bc.getId1()) );
+                all_binary_combinations_mapped.add( mapped_bc );
+                binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
+            }
+            ++identifier_index;
+        }
+       
+        final int number_of_characters = all_binary_combinations_mapped.size();
+        final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
+                                                                                                                                                 number_of_characters );
+        int character_index = 0;
+        for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
+            matrix.setCharacter( character_index++, bc.toString() );
+        }
+        identifier_index = 0;
+        final Set<String> all_identifiers = new HashSet<String>();
+        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
+            final String species_id = gwcd.getSpecies().getSpeciesId();
+            if ( all_identifiers.contains( species_id ) ) {
+                throw new AssertionError( "species [" + species_id + "] is not unique" );
+            }
+            all_identifiers.add( species_id );
+            matrix.setIdentifier( identifier_index, species_id );
+            for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
+                BinaryDomainCombination bc = null;
+                if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
+                    bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
+                }
+                else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
+                    bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
+                }
+                else {
+                    bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
+                }
+                if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
+                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
+                }
+                else {
+                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
+                }
+            }
+            ++identifier_index;
+        }
+        return matrix;
+    }
+    
+    
+    
+    
     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
         if ( gwcd_list.isEmpty() ) {
             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
@@ -621,78 +817,7 @@ public final class DomainParsimonyCalculator {
         return matrix;
     }
 
-    /**
-     * For folds instead of Pfam-domains, for example
-     * 
-     * 
-     * @param gwcd_list
-     * @return
-     */
-    static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
-                                                                                               final Map<DomainId, Set<String>> domain_id_to_second_features_map,
-                                                                                               final Map<Species, MappingResults> mapping_results_map ) {
-        if ( gwcd_list.isEmpty() ) {
-            throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
-        }
-        if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
-            throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
-        }
-        final int number_of_identifiers = gwcd_list.size();
-        final SortedSet<String> all_secondary_features = new TreeSet<String>();
-        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
-            int mapped = 0;
-            int not_mapped = 0;
-            for( final DomainId domain : gwcd.getAllDomainIds() ) {
-                if ( domain_id_to_second_features_map.containsKey( domain ) ) {
-                    all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
-                    mapped++;
-                }
-                else {
-                    not_mapped++;
-                }
-            }
-            if ( mapping_results_map != null ) {
-                final MappingResults mr = new MappingResults();
-                mr.setDescription( gwcd.getSpecies().getSpeciesId() );
-                mr.setSumOfSuccesses( mapped );
-                mr.setSumOfFailures( not_mapped );
-                mapping_results_map.put( gwcd.getSpecies(), mr );
-            }
-        }
-        final int number_of_characters = all_secondary_features.size();
-        final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
-                                                                                                                                                 number_of_characters );
-        int character_index = 0;
-        for( final String second_id : all_secondary_features ) {
-            matrix.setCharacter( character_index++, second_id );
-        }
-        int identifier_index = 0;
-        final Set<String> all_identifiers = new HashSet<String>();
-        for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
-            final String species_id = gwcd.getSpecies().getSpeciesId();
-            if ( all_identifiers.contains( species_id ) ) {
-                throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
-            }
-            all_identifiers.add( species_id );
-            matrix.setIdentifier( identifier_index, species_id );
-            final Set<String> all_second_per_gwcd = new HashSet<String>();
-            for( final DomainId domain : gwcd.getAllDomainIds() ) {
-                if ( domain_id_to_second_features_map.containsKey( domain ) ) {
-                    all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
-                }
-            }
-            for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
-                if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
-                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
-                }
-                else {
-                    matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
-                }
-            }
-            ++identifier_index;
-        }
-        return matrix;
-    }
+   
 
     private static int getStateSumDeltaOnNode( final String node_identifier,
                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
index f979897..9118dd4 100644 (file)
@@ -802,7 +802,7 @@ public final class SurfacingUtil {
         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
                       phylogeny );
-        final Phylogeny local_phylogeny_copy = phylogeny.copy();
+        Phylogeny local_phylogeny_copy = phylogeny.copy();
         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
@@ -840,6 +840,35 @@ public final class SurfacingUtil {
                           parameters_str );
         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
+        
+        
+        // FITCH DOMAIN COMBINATIONS
+        // -------------------------
+         local_phylogeny_copy = phylogeny.copy();
+        String randomization = "no";
+       
+        secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( true );
+        
+      
+     
+      
+        preparePhylogeny( local_phylogeny_copy,
+                          secondary_features_parsimony,
+                          date_time,
+                          "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
+                                  + randomization,
+                          "fitch_on_binary_domain_combinations_" + outfile_name,
+                          parameters_str );
+        SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
+                + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
+        calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
+                + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" );
+   
+        
     }
 
     public static void extractProteinNames( final List<Protein> proteins,