inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
index b7590aa..abcd31f 100644 (file)
@@ -22,7 +22,7 @@
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.surfacing;
 
@@ -39,6 +39,7 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -69,7 +70,7 @@ import org.forester.io.writers.PhylogenyWriter;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE;
+import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
 import org.forester.phylogeny.data.BinaryCharacters;
 import org.forester.phylogeny.data.Confidence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
@@ -120,6 +121,75 @@ public final class SurfacingUtil {
         // Hidden constructor.
     }
 
+    public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
+                                                          final SortedMap<String, Integer> domain_architecuture_counts,
+                                                          final int min_count,
+                                                          final File da_counts_outfile,
+                                                          final File unique_da_outfile ) {
+        checkForOutputFileWriteability( da_counts_outfile );
+        checkForOutputFileWriteability( unique_da_outfile );
+        try {
+            final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
+            final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
+            final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
+            while ( it.hasNext() ) {
+                final Map.Entry<String, Integer> e = it.next();
+                final String da = e.getKey();
+                final int count = e.getValue();
+                if ( count >= min_count ) {
+                    da_counts_out.write( da );
+                    da_counts_out.write( "\t" );
+                    da_counts_out.write( String.valueOf( count ) );
+                    da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
+                }
+                if ( count == 1 ) {
+                    final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
+                    while ( it2.hasNext() ) {
+                        final Map.Entry<String, Set<String>> e2 = it2.next();
+                        final String genome = e2.getKey();
+                        final Set<String> das = e2.getValue();
+                        if ( das.contains( da ) ) {
+                            unique_da_out.write( genome );
+                            unique_da_out.write( "\t" );
+                            unique_da_out.write( da );
+                            unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
+                        }
+                    }
+                }
+            }
+            unique_da_out.close();
+            da_counts_out.close();
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
+        }
+        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
+        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
+        //
+    }
+
+    public static int storeDomainArchitectures( final String genome,
+                                                final SortedMap<String, Set<String>> domain_architecutures,
+                                                final List<Protein> protein_list,
+                                                final Map<String, Integer> distinct_domain_architecuture_counts ) {
+        final Set<String> da = new HashSet<String>();
+        domain_architecutures.put( genome, da );
+        for( final Protein protein : protein_list ) {
+            final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
+            if ( !da.contains( da_str ) ) {
+                if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
+                    distinct_domain_architecuture_counts.put( da_str, 1 );
+                }
+                else {
+                    distinct_domain_architecuture_counts.put( da_str,
+                                                              distinct_domain_architecuture_counts.get( da_str ) + 1 );
+                }
+                da.add( da_str );
+            }
+        }
+        return da.size();
+    }
+
     public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
                                                            final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
         final SortedMap<DomainId, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
@@ -372,7 +442,7 @@ public final class SurfacingUtil {
                         nodes.add( n );
                     }
                 }
-                for( int i = 0; i < nodes.size() - 1; ++i ) {
+                for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
                     for( int j = i + 1; j < nodes.size(); ++j ) {
                         final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
                         String rank = "unknown";
@@ -627,7 +697,7 @@ public final class SurfacingUtil {
 
     public static Map<DomainId, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
             throws IOException {
-        final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, "\t" );
+        final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
         final Map<DomainId, Set<String>> map = new TreeMap<DomainId, Set<String>>();
         for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
             final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) );
@@ -1621,58 +1691,50 @@ public final class SurfacingUtil {
                     + all_pfams_encountered.size() );
             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping                 : "
                     + pfams_without_mappings_counter + " ["
-                    + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
-                    + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
-            ForesterUtil.programMessage( surfacing.PRG_NAME,
-                                         "Pfams with a mapping                    : " + pfams_with_mappings_counter
-                                                 + " ["
-                                                 + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() )
-                                                 + "%]" );
+                    + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+            ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping                    : "
+                    + pfams_with_mappings_counter + " ["
+                    + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func.  : "
                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
-                    + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
-            ForesterUtil.programMessage( surfacing.PRG_NAME,
-                                         "Pfams with mapping to biological process: " + biological_process_counter
-                                                 + " ["
-                                                 + ( 100 * biological_process_counter / all_pfams_encountered.size() )
-                                                 + "%]" );
-            ForesterUtil.programMessage( surfacing.PRG_NAME,
-                                         "Pfams with mapping to molecular function: " + molecular_function_counter
-                                                 + " ["
-                                                 + ( 100 * molecular_function_counter / all_pfams_encountered.size() )
-                                                 + "%]" );
-            ForesterUtil.programMessage( surfacing.PRG_NAME,
-                                         "Pfams with mapping to cellular component: " + cellular_component_counter
-                                                 + " ["
-                                                 + ( 100 * cellular_component_counter / all_pfams_encountered.size() )
-                                                 + "%]" );
+                    + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+            ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
+                    + biological_process_counter + " ["
+                    + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
+            ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
+                    + molecular_function_counter + " ["
+                    + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
+            ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
+                    + cellular_component_counter + " ["
+                    + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Sum of Pfams encountered                : " + all_pfams_encountered.size() );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams without a mapping                 : " + pfams_without_mappings_counter
-                    + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+                    + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams without mapping to proc. or func. : "
                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
-                    + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams with a mapping                    : " + pfams_with_mappings_counter + " ["
-                    + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams with a mapping to proc. or func.  : "
                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
-                    + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
-                    + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
-                    + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
-                    + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" );
+                    + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
             summary_writer.close();
         }