in progress
[jalview.git] / forester / java / src / org / forester / go / etc / MetaOntologizer.java
index 970f939..36b2801 100644 (file)
@@ -5,7 +5,7 @@
 // Copyright (C) 2008-2009 Christian M. Zmasek
 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
 // All rights reserved
-// 
+//
 // This library is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
 // License as published by the Free Software Foundation; either
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 // Lesser General Public License for more details.
-// 
+//
 // You should have received a copy of the GNU Lesser General Public
 // License along with this library; if not, write to the Free Software
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.go.etc;
 
@@ -51,9 +51,8 @@ import org.forester.go.GoTerm;
 import org.forester.go.GoUtils;
 import org.forester.go.OBOparser;
 import org.forester.go.PfamToGoMapping;
-import org.forester.surfacing.BasicSpecies;
-import org.forester.surfacing.DomainId;
-import org.forester.surfacing.Species;
+import org.forester.species.BasicSpecies;
+import org.forester.species.Species;
 import org.forester.surfacing.SurfacingConstants;
 import org.forester.surfacing.SurfacingUtil;
 import org.forester.util.ForesterUtil;
@@ -66,7 +65,10 @@ public class MetaOntologizer {
     final static private String       PRG_NAME                         = "meta_ontologizer";
     private static final boolean      VERBOSE                          = true;
     //table-a_41_dollo_all_gains_d-Topology-Elim-Bonferroni.txt:
-    private final static Pattern      PATTERN_ONTOLOGIZER_TABLE_OUTPUT = Pattern.compile( ".*table-(.+)_dollo_.*",
+    //TODO: temporary change -- switch back to the original "table-(.+)_dollo_" pattern (kept commented out below)
+    // private final static Pattern      PATTERN_ONTOLOGIZER_TABLE_OUTPUT = Pattern.compile( ".*table-(.+)_dollo_.*",
+    //                                                                                      Pattern.CASE_INSENSITIVE ); //TODO this might need some work...
+    private final static Pattern      PATTERN_ONTOLOGIZER_TABLE_OUTPUT = Pattern.compile( ".*table-(.+)\\.txt",
                                                                                           Pattern.CASE_INSENSITIVE ); //TODO this might need some work...
 
     private static boolean hasResultsForSpecies( final Map<GoId, GoTerm> go_id_to_terms,
@@ -82,12 +84,12 @@ public class MetaOntologizer {
     }
 
     private static StringBuilder obtainDomainsForGoId( final List<PfamToGoMapping> pfam_to_go,
-                                                       final SortedSet<DomainId> domains_per_species,
+                                                       final SortedSet<String> domains_per_species,
                                                        final Map<GoId, GoTerm> all_go_terms,
                                                        final GoId query_go_id,
-                                                       final Set<DomainId> found_domain_ids ) {
+                                                       final Set<String> found_domain_ids ) {
         final StringBuilder sb = new StringBuilder();
-        D: for( final DomainId domain_id : domains_per_species ) {
+        D: for( final String domain_id : domains_per_species ) {
             for( final PfamToGoMapping ptg : pfam_to_go ) {
                 if ( ptg.getKey().equals( domain_id ) ) {
                     final GoId go_id = ptg.getValue();
@@ -115,7 +117,7 @@ public class MetaOntologizer {
             species = matcher.group( 1 );
             if ( VERBOSE ) {
                 ForesterUtil
-                        .programMessage( PRG_NAME, "species for [" + ontologizer_outfile + "] is [" + species + "]" );
+                .programMessage( PRG_NAME, "species for [" + ontologizer_outfile + "] is [" + species + "]" );
             }
         }
         else {
@@ -125,13 +127,12 @@ public class MetaOntologizer {
         return species;
     }
 
-    private static SortedMap<Species, SortedSet<DomainId>> parseDomainGainLossFile( final File input )
-            throws IOException {
+    private static SortedMap<Species, SortedSet<String>> parseDomainGainLossFile( final File input ) throws IOException {
         final String error = ForesterUtil.isReadableFile( input );
         if ( !ForesterUtil.isEmpty( error ) ) {
             throw new IOException( error );
         }
-        final SortedMap<Species, SortedSet<DomainId>> speciesto_to_domain_id = new TreeMap<Species, SortedSet<DomainId>>();
+        final SortedMap<Species, SortedSet<String>> speciesto_to_domain_id = new TreeMap<Species, SortedSet<String>>();
         final BufferedReader br = new BufferedReader( new FileReader( input ) );
         String line;
         int line_number = 0;
@@ -145,13 +146,16 @@ public class MetaOntologizer {
                 }
                 else if ( line.startsWith( "#" ) ) {
                     current_species = new BasicSpecies( line.substring( 1 ) );
-                    speciesto_to_domain_id.put( current_species, new TreeSet<DomainId>() );
+                    speciesto_to_domain_id.put( current_species, new TreeSet<String>() );
+                    if ( VERBOSE ) {
+                        ForesterUtil.programMessage( PRG_NAME, "saw " + current_species );
+                    }
                 }
                 else {
                     if ( current_species == null ) {
                         throw new IOException( "parsing problem [at line " + line_number + "] in [" + input + "]" );
                     }
-                    speciesto_to_domain_id.get( current_species ).add( new DomainId( line ) );
+                    speciesto_to_domain_id.get( current_species ).add( new String( line ) );
                 }
             }
         }
@@ -172,9 +176,9 @@ public class MetaOntologizer {
                                            final SortedMap<String, SortedSet<OntologizerResult>> species_to_results_map,
                                            final String species,
                                            final double p_adjusted_upper_limit,
-                                           final SortedSet<DomainId> domains_per_species,
+                                           final SortedSet<String> domains_per_species,
                                            final List<PfamToGoMapping> pfam_to_go,
-                                           final Set<DomainId> domain_ids_with_go_annot ) throws IOException {
+                                           final Set<String> domain_ids_with_go_annot ) throws IOException {
         final SortedSet<OntologizerResult> ontologizer_results = species_to_results_map.get( species );
         for( final OntologizerResult ontologizer_result : ontologizer_results ) {
             final GoTerm go_term = go_id_to_terms.get( ontologizer_result.getGoId() );
@@ -222,9 +226,9 @@ public class MetaOntologizer {
         }
         if ( ( p_adjusted_upper_limit < 0.0 ) || ( p_adjusted_upper_limit > 1.0 ) ) {
             throw new IllegalArgumentException( "adjusted P values limit [" + p_adjusted_upper_limit
-                    + "] is out of range" );
+                                                + "] is out of range" );
         }
-        SortedMap<Species, SortedSet<DomainId>> speciesto_to_domain_id = null;
+        SortedMap<Species, SortedSet<String>> speciesto_to_domain_id = null;
         if ( domain_gain_loss_file != null ) {
             if ( !domain_gain_loss_file.exists() ) {
                 throw new IllegalArgumentException( "[" + domain_gain_loss_file + "] does not exist" );
@@ -232,7 +236,7 @@ public class MetaOntologizer {
             speciesto_to_domain_id = parseDomainGainLossFile( domain_gain_loss_file );
             if ( VERBOSE ) {
                 ForesterUtil.programMessage( PRG_NAME, "parsed gain/loss domains for " + speciesto_to_domain_id.size()
-                        + " species from [" + domain_gain_loss_file + "]" );
+                                             + " species from [" + domain_gain_loss_file + "]" );
             }
         }
         final String[] children = ontologizer_outdir.list();
@@ -249,7 +253,7 @@ public class MetaOntologizer {
         }
         if ( VERBOSE ) {
             ForesterUtil.programMessage( PRG_NAME, "need to analyze " + ontologizer_outfiles.size()
-                    + " Ontologizer outfiles from [" + ontologizer_outdir + "]" );
+                                         + " Ontologizer outfiles from [" + ontologizer_outdir + "]" );
         }
         final OBOparser parser = new OBOparser( obo_file, OBOparser.ReturnType.BASIC_GO_TERM );
         final List<GoTerm> go_terms = parser.parse();
@@ -277,7 +281,7 @@ public class MetaOntologizer {
         for( final File ontologizer_outfile : ontologizer_outfiles ) {
             final String species = obtainSpecies( ontologizer_outfile );
             final List<OntologizerResult> ontologizer_results = OntologizerResult.parse( new File( ontologizer_outdir
-                    + ForesterUtil.FILE_SEPARATOR + ontologizer_outfile ) );
+                                                                                                   + ForesterUtil.FILE_SEPARATOR + ontologizer_outfile ) );
             final SortedSet<OntologizerResult> filtered_ontologizer_results = new TreeSet<OntologizerResult>();
             for( final OntologizerResult ontologizer_result : ontologizer_results ) {
                 if ( ontologizer_result.getPAdjusted() <= p_adjusted_upper_limit ) {
@@ -296,18 +300,18 @@ public class MetaOntologizer {
         writeHtmlHeader( b_html_writer,
                          GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS.toString() + " | Pmax = "
                                  + p_adjusted_upper_limit + " | " + comment,
-                         ontologizer_outdir.getAbsolutePath(),
-                         domain_gain_loss_file_full_path_str );
+                                 ontologizer_outdir.getAbsolutePath(),
+                                 domain_gain_loss_file_full_path_str );
         writeHtmlHeader( c_html_writer,
                          GoNameSpace.GoNamespaceType.CELLULAR_COMPONENT.toString() + " | Pmax = "
                                  + p_adjusted_upper_limit + " | " + comment,
-                         ontologizer_outdir.getAbsolutePath(),
-                         domain_gain_loss_file_full_path_str );
+                                 ontologizer_outdir.getAbsolutePath(),
+                                 domain_gain_loss_file_full_path_str );
         writeHtmlHeader( m_html_writer,
                          GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION.toString() + " | Pmax = "
                                  + p_adjusted_upper_limit + " | " + comment,
-                         ontologizer_outdir.getAbsolutePath(),
-                         domain_gain_loss_file_full_path_str );
+                                 ontologizer_outdir.getAbsolutePath(),
+                                 domain_gain_loss_file_full_path_str );
         for( final String species : species_to_results_map.keySet() ) {
             if ( hasResultsForSpecies( go_id_to_terms,
                                        species_to_results_map,
@@ -327,11 +331,11 @@ public class MetaOntologizer {
                                        GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION ) ) {
                 writeHtmlSpecies( m_html_writer, species );
             }
-            SortedSet<DomainId> domains_per_species = null;
+            SortedSet<String> domains_per_species = null;
             if ( ( speciesto_to_domain_id != null ) && ( speciesto_to_domain_id.size() > 0 ) ) {
                 domains_per_species = speciesto_to_domain_id.get( new BasicSpecies( species ) );
             }
-            final Set<DomainId> domain_ids_with_go_annot = new HashSet<DomainId>();
+            final Set<String> domain_ids_with_go_annot = new HashSet<String>();
             processOneSpecies( go_id_to_terms,
                                b_html_writer,
                                b_tab_writer,
@@ -377,30 +381,30 @@ public class MetaOntologizer {
         m_tab_writer.close();
         if ( VERBOSE ) {
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote biological process summary to [" + b_file_html
-                    + "]" );
+                                         + "]" );
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote biological process summary to [" + b_file_txt
-                    + "]" );
+                                         + "]" );
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote molecular function summary to [" + m_file_html
-                    + "]" );
+                                         + "]" );
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote molecular function summary to [" + m_file_txt
-                    + "]" );
+                                         + "]" );
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote cellular component summary to [" + c_file_html
-                    + "]" );
+                                         + "]" );
             ForesterUtil.programMessage( PRG_NAME, "successfully wrote cellular component summary to [" + c_file_txt
-                    + "]" );
+                                         + "]" );
         }
     }
 
     private static void writeHtmlDomains( final Writer writer,
-                                          final SortedSet<DomainId> domains,
-                                          final Set<DomainId> domain_ids_with_go_annot ) throws IOException {
+                                          final SortedSet<String> domains,
+                                          final Set<String> domain_ids_with_go_annot ) throws IOException {
         writer.write( "<tr>" );
         writer.write( "<td colspan=\"10\">" );
         if ( domains != null ) {
-            for( final DomainId domain : domains ) {
+            for( final String domain : domains ) {
                 if ( !domain_ids_with_go_annot.contains( domain ) ) {
                     writer.write( "[<a class=\"new_type\" href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain
-                            + "\">" + domain + "</a>] " );
+                                  + "\">" + domain + "</a>] " );
                 }
             }
         }
@@ -438,22 +442,18 @@ public class MetaOntologizer {
         w.write( ForesterUtil.LINE_SEPARATOR );
         w.write( "a.new_type:link { font-size: 7pt; color : #505050; text-decoration : none; }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
-        w
-                .write( "a.new_type:hover { font-size: 7pt; color : #000000; background-color : #FFFF00; text-decoration : none; }" );
+        w.write( "a.new_type:hover { font-size: 7pt; color : #000000; background-color : #FFFF00; text-decoration : none; }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
-        w
-                .write( "a.new_type:hover { font-size: 7pt; color : #000000; background-color : #FFFF00; text-decoration : none; }" );
+        w.write( "a.new_type:hover { font-size: 7pt; color : #000000; background-color : #FFFF00; text-decoration : none; }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
         w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
         w.write( ForesterUtil.LINE_SEPARATOR );
-        w
-                .write( "th { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 10pt; font-weight: bold }" );
+        w.write( "th { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 10pt; font-weight: bold }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
         w.write( "h1 { color : #000000; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
         w.write( "h2 { color : #000000; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
-        w
-                .write( "h3 { margin-top: 12px;  margin-bottom: 0px; color : #000000; font-family: Verdana, Arial, Helvetica; font-size: 12pt; font-weight: bold }" );
+        w.write( "h3 { margin-top: 12px;  margin-bottom: 0px; color : #000000; font-family: Verdana, Arial, Helvetica; font-size: 12pt; font-weight: bold }" );
         w.write( ForesterUtil.LINE_SEPARATOR );
         w.write( "</style>" );
         w.write( ForesterUtil.LINE_SEPARATOR );
@@ -518,7 +518,7 @@ public class MetaOntologizer {
         writer.write( "<tr>" );
         writer.write( "<td><h3>" );
         writer.write( species );
-        SurfacingUtil.writeTaxonomyLinks( writer, species );
+        SurfacingUtil.writeTaxonomyLinks( writer, species, null );
         writer.write( "</h3></td>" );
         writer.write( "</tr>" );
         writer.write( ForesterUtil.LINE_SEPARATOR );
@@ -553,9 +553,9 @@ public class MetaOntologizer {
                                                  final double p_adjusted_upper_limit,
                                                  final String species,
                                                  final Map<GoId, GoTerm> go_id_to_terms,
-                                                 final SortedSet<DomainId> domains_per_species,
+                                                 final SortedSet<String> domains_per_species,
                                                  final List<PfamToGoMapping> pfam_to_go,
-                                                 final Set<DomainId> domain_ids_with_go_annot ) throws IOException {
+                                                 final Set<String> domain_ids_with_go_annot ) throws IOException {
         final Color p_adj_color = ForesterUtil.calcColor( ontologizer_result.getPAdjusted(),
                                                           0,
                                                           p_adjusted_upper_limit,
@@ -573,7 +573,7 @@ public class MetaOntologizer {
         writer.write( "</font>" );
         writer.write( "</td><td>" );
         writer.write( "<a href=\"" + SurfacingConstants.GO_LINK + ontologizer_result.getGoId().getId()
-                + "\" target=\"amigo_window\">" + ontologizer_result.getGoId().getId() + "</a>" );
+                      + "\" target=\"amigo_window\">" + ontologizer_result.getGoId().getId() + "</a>" );
         writer.write( "</td><td>" );
         writer.write( "<font color=\"#" + ForesterUtil.colorToHex( p_adj_color ) + "\">" );
         writer.write( FORMATER.format( ontologizer_result.getPAdjusted() ) );
@@ -592,8 +592,11 @@ public class MetaOntologizer {
         writer.write( String.valueOf( ontologizer_result.getStudyTerm() ) );
         writer.write( "</td><td>" );
         if ( domains_per_species != null ) {
-            final StringBuilder sb = obtainDomainsForGoId( pfam_to_go, domains_per_species, go_id_to_terms, go_term
-                    .getGoId(), domain_ids_with_go_annot );
+            final StringBuilder sb = obtainDomainsForGoId( pfam_to_go,
+                                                           domains_per_species,
+                                                           go_id_to_terms,
+                                                           go_term.getGoId(),
+                                                           domain_ids_with_go_annot );
             writer.write( sb.toString() );
         }
         else {