in progress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
index 2a607cd..c0b8b8f 100644 (file)
@@ -71,6 +71,12 @@ import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE;
 import org.forester.phylogeny.data.BinaryCharacters;
 import org.forester.phylogeny.data.Confidence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
+import org.forester.species.Species;
 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
 import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput;
 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
@@ -105,6 +111,7 @@ public final class SurfacingUtil {
                                                                                  }
                                                                              };
     public final static Pattern             PATTERN_SP_STYLE_TAXONOMY        = Pattern.compile( "^[A-Z0-9]{3,5}$" );
+    private static final boolean            USE_LAST                         = true;
 
     private SurfacingUtil() {
         // Hidden constructor.
@@ -673,7 +680,7 @@ public final class SurfacingUtil {
                 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
             }
             else {
-                domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( false );
+                domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST );
             }
             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
                     + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
@@ -802,7 +809,7 @@ public final class SurfacingUtil {
         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
                       phylogeny );
-        final Phylogeny local_phylogeny_copy = phylogeny.copy();
+        Phylogeny local_phylogeny_copy = phylogeny.copy();
         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
@@ -840,70 +847,61 @@ public final class SurfacingUtil {
                           parameters_str );
         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
+        // FITCH DOMAIN COMBINATIONS
+        // -------------------------
+        local_phylogeny_copy = phylogeny.copy();
+        final String randomization = "no";
+        secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST );
+        preparePhylogeny( local_phylogeny_copy,
+                          secondary_features_parsimony,
+                          date_time,
+                          "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
+                                  + randomization,
+                          "fitch_on_binary_domain_combinations_" + outfile_name,
+                          parameters_str );
+        SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
+                + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
+        calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
+                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
+                + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
+                + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" );
     }
 
     public static void extractProteinNames( final List<Protein> proteins,
                                             final List<DomainId> query_domain_ids_nc_order,
                                             final Writer out,
-                                            final String separator ) throws IOException {
+                                            final String separator,
+                                            final String limit_to_species ) throws IOException {
         for( final Protein protein : proteins ) {
-            if ( protein.contains( query_domain_ids_nc_order, true ) ) {
-                out.write( protein.getSpecies().getSpeciesId() );
-                out.write( separator );
-                out.write( protein.getProteinId().getId() );
-                out.write( separator );
-                out.write( "[" );
-                final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
-                boolean first = true;
-                for( final Domain domain : protein.getProteinDomains() ) {
-                    if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
-                        visited_domain_ids.add( domain.getDomainId() );
-                        if ( first ) {
-                            first = false;
-                        }
-                        else {
-                            out.write( " " );
-                        }
-                        out.write( domain.getDomainId().getId() );
-                        out.write( " {" );
-                        out.write( "" + domain.getTotalCount() );
-                        out.write( "}" );
-                    }
-                }
-                out.write( "]" );
-                out.write( separator );
-                if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
-                        .equals( SurfacingConstants.NONE ) ) ) {
-                    out.write( protein.getDescription() );
-                }
-                out.write( separator );
-                if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
-                        .equals( SurfacingConstants.NONE ) ) ) {
-                    out.write( protein.getAccession() );
-                }
-                out.write( SurfacingConstants.NL );
-            }
-        }
-        out.flush();
-    }
-
-    public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
-                                            final DomainId domain_id,
-                                            final Writer out,
-                                            final String separator ) throws IOException {
-        for( final Species species : protein_lists_per_species.keySet() ) {
-            for( final Protein protein : protein_lists_per_species.get( species ) ) {
-                final List<Domain> domains = protein.getProteinDomains( domain_id );
-                if ( domains.size() > 0 ) {
-                    final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
-                    for( final Domain domain : domains ) {
-                        stats.addValue( domain.getPerSequenceEvalue() );
-                    }
+            if ( ForesterUtil.isEmpty( limit_to_species )
+                    || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
+                if ( protein.contains( query_domain_ids_nc_order, true ) ) {
                     out.write( protein.getSpecies().getSpeciesId() );
                     out.write( separator );
                     out.write( protein.getProteinId().getId() );
                     out.write( separator );
-                    out.write( "[" + FORMATTER.format( stats.median() ) + "]" );
+                    out.write( "[" );
+                    final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
+                    boolean first = true;
+                    for( final Domain domain : protein.getProteinDomains() ) {
+                        if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
+                            visited_domain_ids.add( domain.getDomainId() );
+                            if ( first ) {
+                                first = false;
+                            }
+                            else {
+                                out.write( " " );
+                            }
+                            out.write( domain.getDomainId().getId() );
+                            out.write( " {" );
+                            out.write( "" + domain.getTotalCount() );
+                            out.write( "}" );
+                        }
+                    }
+                    out.write( "]" );
                     out.write( separator );
                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
                             .equals( SurfacingConstants.NONE ) ) ) {
@@ -921,6 +919,44 @@ public final class SurfacingUtil {
         out.flush();
     }
 
+    public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species, // proteins to scan, grouped by species
+                                            final DomainId domain_id, // handed to Protein.getProteinDomains(); proteins yielding no domains are skipped
+                                            final Writer out, // receives one record per reported protein; flushed at the end, never closed here
+                                            final String separator, // written between the fields of each record
+                                            final String limit_to_species ) throws IOException { // species id filter; no filtering when ForesterUtil.isEmpty() accepts it
+        for( final Species species : protein_lists_per_species.keySet() ) { // the key is used only to look up its protein list
+            for( final Protein protein : protein_lists_per_species.get( species ) ) {
+                if ( ForesterUtil.isEmpty( limit_to_species ) // no species limit given: every protein is a candidate
+                        || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { // filter tests the protein's own species id (ignoring case), not the map key
+                    final List<Domain> domains = protein.getProteinDomains( domain_id ); // presumably only the domains with this id -- confirm against Protein
+                    if ( domains.size() > 0 ) {
+                        final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); // gathers the per-sequence E-values of the returned domains
+                        for( final Domain domain : domains ) {
+                            stats.addValue( domain.getPerSequenceEvalue() );
+                        }
+                        out.write( protein.getSpecies().getSpeciesId() ); // field 1: species id
+                        out.write( separator );
+                        out.write( protein.getProteinId().getId() ); // field 2: protein id
+                        out.write( separator );
+                        out.write( "[" + FORMATTER.format( stats.median() ) + "]" ); // field 3: median of the collected E-values, in square brackets
+                        out.write( separator );
+                        if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() // field 4: description, left blank when empty or NONE
+                                .equals( SurfacingConstants.NONE ) ) ) {
+                            out.write( protein.getDescription() );
+                        }
+                        out.write( separator ); // written unconditionally, so a blank description still occupies its field
+                        if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() // field 5: accession, left blank when empty or NONE
+                                .equals( SurfacingConstants.NONE ) ) ) {
+                            out.write( protein.getAccession() );
+                        }
+                        out.write( SurfacingConstants.NL ); // terminates the record
+                    }
+                }
+            }
+        }
+        out.flush(); // single flush after all species have been processed
+    }
+
     public static SortedSet<DomainId> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
         final SortedSet<DomainId> all_domains_ids = new TreeSet<DomainId>();
         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
@@ -1124,7 +1160,7 @@ public final class SurfacingUtil {
                                                     final boolean remove_engulfed_domains,
                                                     final Protein protein ) {
         final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
-                .getSpeciesId() );
+                .getSpeciesId(), protein.getLength() );
         final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
         final List<Boolean> covered_positions = new ArrayList<Boolean>();
         for( final Domain domain : sorted ) {
@@ -2437,17 +2473,32 @@ public final class SurfacingUtil {
         try {
             writer.write( genome );
             writer.write( "\t" );
-            writer.write( stats.arithmeticMean() + "" );
-            writer.write( "\t" );
-            writer.write( stats.sampleStandardDeviation() + "" );
-            writer.write( "\t" );
-            writer.write( stats.median() + "" );
-            writer.write( "\t" );
-            writer.write( stats.getN() + "" );
-            writer.write( "\t" );
-            writer.write( stats.getMin() + "" );
-            writer.write( "\t" );
-            writer.write( stats.getMax() + "" );
+            if ( stats.getN() >= 1 ) {
+                writer.write( stats.arithmeticMean() + "" );
+                writer.write( "\t" );
+                if ( stats.getN() >= 2 ) {
+                    writer.write( stats.sampleStandardDeviation() + "" );
+                }
+                else {
+                    writer.write( "" );
+                }
+                writer.write( "\t" );
+                writer.write( stats.median() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getN() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getMin() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getMax() + "" );
+            }
+            else {
+                writer.write( "\t" );
+                writer.write( "\t" );
+                writer.write( "\t" );
+                writer.write( "0" );
+                writer.write( "\t" );
+                writer.write( "\t" );
+            }
             writer.write( "\n" );
         }
         catch ( final IOException e ) {