From: cmzmasek Date: Mon, 27 Mar 2017 05:01:52 +0000 (-0700) Subject: in progress... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=2320da44a171273ec44c0adcaf991687a4780b6e;p=jalview.git in progress... --- diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index c35c98d..88be3d2 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -1775,32 +1775,27 @@ public class surfacing { ForesterUtil .programMessage( PRG_NAME, "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); - // - if ( true ) { //TODO - try { - MinimalDomainomeCalculator.calcOme( false, - intrees[ 0 ], - protein_lists_per_species, - "---", - 1000, - out_dir.toString() + "/" + output_file ); - } - catch ( IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } + try { + MinimalDomainomeCalculator.calcOme( false, + intrees[ 0 ], + protein_lists_per_species, + "---", + -1, + out_dir.toString() + "/" + output_file ); } - if ( true ) { //TODO - try { - MinimalDomainomeCalculator.calcOme( true, - intrees[ 0 ], - protein_lists_per_species, - "---", - 1000, - out_dir.toString() + "/" + output_file ); - } - catch ( IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } + catch ( IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + try { + MinimalDomainomeCalculator.calcOme( true, + intrees[ 0 ], + protein_lists_per_species, + "---", + -1, + out_dir.toString() + "/" + output_file ); + } + catch ( IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); } if ( da_analysis ) { SurfacingUtil.performDomainArchitectureAnalysis( distinct_domain_architecutures_per_genome, diff --git a/forester/java/src/org/forester/protein/BasicProtein.java b/forester/java/src/org/forester/protein/BasicProtein.java index 975e48d..91f5eb4 100644 --- a/forester/java/src/org/forester/protein/BasicProtein.java +++ b/forester/java/src/org/forester/protein/BasicProtein.java @@ -233,6 +233,8 @@ public class BasicProtein implements Protein { return sb.toString(); } + + public final String toDomainArchitectureString( final String separator ) { return toDomainArchitectureString( separator, -1 ); } diff --git a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java index ba667ce..9abb02a 100644 --- a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java +++ b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java @@ -5,11 +5,11 @@ import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.io.Writer; import java.util.ArrayList; -import java.util.HashMap; +import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; @@ -25,60 +25,11 @@ import org.forester.protein.Domain; import org.forester.protein.Protein; import org.forester.species.BasicSpecies; import org.forester.species.Species; +import org.forester.surfacing.SurfacingUtil.DomainComparator; import org.forester.util.ForesterUtil; public final class MinimalDomainomeCalculator { - static final public void calcDomainome( final Phylogeny tre, - final SortedMap> protein_lists_per_species, - final double ie_cutoff ) { - if ( protein_lists_per_species == null || tre == null ) { - throw new IllegalArgumentException( "argument is null" ); - } - if ( protein_lists_per_species.size() < 2 ) { - throw new IllegalArgumentException( "not enough genomes" ); - } - for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - if ( node.isInternal() ) { - System.out.println(); - if ( node.getNodeData().isHasTaxonomy() ) { - System.out.println( node.getNodeData().getTaxonomy().getScientificName() + ":" ); - } - else { - System.out.println( node.getName() + ":" ); - } - final List external_descs = node.getAllExternalDescendants(); - final List> domains_per_genome_list = new ArrayList>(); - for( final PhylogenyNode external_desc : external_descs ) { - final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); - System.out.print( code + " " ); - final List proteins_per_species = protein_lists_per_species - .get( new BasicSpecies( code ) ); - if ( proteins_per_species != null ) { - final SortedSet domains_per_genome = new TreeSet(); - for( final Protein protein : proteins_per_species ) { - List domains = protein.getProteinDomains(); - for( final Domain domain : domains ) { - if ( ( domain.getPerDomainEvalue() <= ie_cutoff ) || ( ie_cutoff <= -1 ) ) { - domains_per_genome.add( domain.getDomainId() ); - } - } - } - if ( domains_per_genome.size() > 0 ) { - domains_per_genome_list.add( domains_per_genome ); - } - } - } - System.out.println(); - if ( domains_per_genome_list.size() > 0 ) { - Set intersection = calcIntersection( domains_per_genome_list ); - System.out.println( intersection ); - } - } - } - } - static final public void calcOme( final boolean use_domain_architectures, final Phylogeny tre, final SortedMap> protein_lists_per_species, @@ -86,7 +37,7 @@ public final class MinimalDomainomeCalculator { final double ie_cutoff, final String outfile_base ) throws IOException { - final SortedMap> species_to_das_map = new TreeMap>(); + final SortedMap> species_to_features_map = new TreeMap>(); if ( protein_lists_per_species == null || tre == null ) { throw new IllegalArgumentException( "argument is null" ); } @@ -106,7 +57,7 @@ public final class MinimalDomainomeCalculator { SurfacingUtil.checkForOutputFileWriteability( outfile_table ); final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); - out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" ); + out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" ); out.write( ForesterUtil.LINE_SEPARATOR ); for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); @@ -143,7 +94,7 @@ public final class MinimalDomainomeCalculator { else { out.write( "\t\t" ); } - final List> das_per_genome_list = new ArrayList>(); + final List> features_per_genome_list = new ArrayList>(); boolean first = true; for( final PhylogenyNode external_desc : external_descs ) { final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); @@ -158,28 +109,28 @@ public final class MinimalDomainomeCalculator { } final List proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) ); if ( proteins_per_species != null ) { - final SortedSet das_per_genome = new TreeSet(); + final SortedSet features_per_genome = new TreeSet(); for( final Protein protein : proteins_per_species ) { if ( use_domain_architectures ) { final String da = protein.toDomainArchitectureString( separator, ie_cutoff ); - das_per_genome.add( da ); + features_per_genome.add( da ); } else { List domains = protein.getProteinDomains(); for( final Domain domain : domains ) { if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { - das_per_genome.add( domain.getDomainId() ); + features_per_genome.add( domain.getDomainId() ); } } } } - if ( das_per_genome.size() > 0 ) { - das_per_genome_list.add( das_per_genome ); + if ( features_per_genome.size() > 0 ) { + features_per_genome_list.add( features_per_genome ); } } } - if ( das_per_genome_list.size() > 0 ) { - SortedSet intersection = calcIntersection( das_per_genome_list ); + if ( features_per_genome_list.size() > 0 ) { + SortedSet intersection = calcIntersection( features_per_genome_list ); out.write( "\t" + intersection.size() + "\t" ); first = true; for( final String s : intersection ) { @@ -192,15 +143,15 @@ public final class MinimalDomainomeCalculator { out.write( s ); } out.write( ForesterUtil.LINE_SEPARATOR ); - species_to_das_map.put( species_name, intersection ); + species_to_features_map.put( species_name, intersection ); } } final SortedSet all_species_names = new TreeSet(); - final SortedSet all_das = new TreeSet(); - for( final Entry> e : species_to_das_map.entrySet() ) { + final SortedSet all_features = new TreeSet(); + for( final Entry> e : species_to_features_map.entrySet() ) { all_species_names.add( e.getKey() ); - for( final String das : e.getValue() ) { - all_das.add( das ); + for( final String f : e.getValue() ) { + all_features.add( f ); } } out_table.write( '\t' ); @@ -215,7 +166,7 @@ public final class MinimalDomainomeCalculator { out_table.write( species_name ); } out_table.write( ForesterUtil.LINE_SEPARATOR ); - for( final String das : all_das ) { + for( final String das : all_features ) { out_table.write( das ); out_table.write( '\t' ); first = true; @@ -226,7 +177,7 @@ public final class MinimalDomainomeCalculator { else { out_table.write( '\t' ); } - if ( species_to_das_map.get( species_name ).contains( das ) ) { + if ( species_to_features_map.get( species_name ).contains( das ) ) { out_table.write( '1' ); } else { @@ -241,152 +192,26 @@ public final class MinimalDomainomeCalculator { out_table.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table ); - } - - static final public void calcDAome( final Phylogeny tre, - final SortedMap> protein_lists_per_species, - final String separator, - final double ie_cutoff, - final String outfile_base ) - throws IOException { - final SortedMap> species_to_das_map = new TreeMap>(); - if ( protein_lists_per_species == null || tre == null ) { - throw new IllegalArgumentException( "argument is null" ); - } - if ( protein_lists_per_species.size() < 2 ) { - throw new IllegalArgumentException( "not enough genomes" ); - } - final File outfile = new File( outfile_base + "_minimal_daome.txt" ); - final File outfile_table = new File( outfile_base + "_minimal_daome.tsv" ); - SurfacingUtil.checkForOutputFileWriteability( outfile ); - SurfacingUtil.checkForOutputFileWriteability( outfile_table ); - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); - out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" ); - out.write( ForesterUtil.LINE_SEPARATOR ); - for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - final String species_name = node.getNodeData().isHasTaxonomy() - ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); - final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName() - : ""; - final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode() - : ""; - final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : ""; - out.write( species_name ); - if ( !ForesterUtil.isEmpty( common ) ) { - out.write( "\t" + common ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( tcode ) ) { - out.write( "\t" + tcode ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( rank ) ) { - out.write( "\t" + rank ); - } - else { - out.write( "\t" ); - } - final List external_descs = node.getAllExternalDescendants(); - if ( node.isInternal() ) { - out.write( "\t" + external_descs.size() + "\t" ); - } - else { - out.write( "\t\t" ); - } - final List> das_per_genome_list = new ArrayList>(); - boolean first = true; - for( final PhylogenyNode external_desc : external_descs ) { - final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); - if ( node.isInternal() ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - out.write( code ); - } - final List proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) ); - if ( proteins_per_species != null ) { - final SortedSet das_per_genome = new TreeSet(); - for( final Protein protein : proteins_per_species ) { - final String da = protein.toDomainArchitectureString( separator, ie_cutoff ); - das_per_genome.add( da ); - } - if ( das_per_genome.size() > 0 ) { - das_per_genome_list.add( das_per_genome ); - } - } - } - if ( das_per_genome_list.size() > 0 ) { - SortedSet intersection = calcIntersection( das_per_genome_list ); - out.write( "\t" + intersection.size() + "\t" ); - first = true; - for( final String s : intersection ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - out.write( s ); - } - out.write( ForesterUtil.LINE_SEPARATOR ); - species_to_das_map.put( species_name, intersection ); - } - } - final SortedSet all_species_names = new TreeSet(); - final SortedSet all_das = new TreeSet(); - for( final Entry> e : species_to_das_map.entrySet() ) { - all_species_names.add( e.getKey() ); - for( final String das : e.getValue() ) { - all_das.add( das ); - } - } - out_table.write( '\t' ); - boolean first = true; - for( final String species_name : all_species_names ) { - if ( first ) { - first = false; + for( String f : all_features ) { + final String a; + if ( use_domain_architectures ) { + a = "DA_"; } else { - out_table.write( '\t' ); - } - out_table.write( species_name ); - } - out_table.write( ForesterUtil.LINE_SEPARATOR ); - for( final String das : all_das ) { - out_table.write( das ); - out_table.write( '\t' ); - first = true; - for( final String species_name : all_species_names ) { - if ( first ) { - first = false; - } - else { - out_table.write( '\t' ); - } - if ( species_to_das_map.get( species_name ).contains( das ) ) { - out_table.write( '1' ); - } - else { - out_table.write( '0' ); - } + a = "domain_"; } - out_table.write( ForesterUtil.LINE_SEPARATOR ); + final File prot_dir = new File( outfile_base + "_prot" ); + prot_dir.mkdir(); + final File outt = new File( outfile_base + "_prot/" + a + f + surfacing.SEQ_EXTRACT_SUFFIX ); + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( outt ) ); + extractProteinFeatures( use_domain_architectures, + protein_lists_per_species, + f, + proteins_file_writer, + ie_cutoff, + separator ); + proteins_file_writer.close(); } - out.flush(); - out.close(); - out_table.flush(); - out_table.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table ); } private final static SortedSet calcIntersection( final List> features_per_genome_list ) { @@ -401,6 +226,102 @@ public final class MinimalDomainomeCalculator { return my_first; } + public static void extractProteinFeatures( final boolean use_domain_architectures, + final SortedMap> protein_lists_per_species, + final String domain_id, + final Writer out, + final double ie_cutoff, + final String domain_separator ) + throws IOException { + final String separator_for_output = "\t"; + for( final Species species : protein_lists_per_species.keySet() ) { + final List proteins_per_species = protein_lists_per_species.get( species ); + for( final Protein protein : proteins_per_species ) { + if ( use_domain_architectures ) { + if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) { + int from = Integer.MAX_VALUE; + int to = -1; + for( final Domain d : protein.getProteinDomains() ) { + if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) { + if ( d.getFrom() < from ) { + from = d.getFrom(); + } + if ( d.getTo() > to ) { + to = d.getTo(); + } + } + } + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator_for_output ); + out.write( protein.getProteinId().getId() ); + out.write( separator_for_output ); + out.write( domain_id ); + out.write( separator_for_output ); + out.write( "/" ); + out.write( from + "-" + to ); + out.write( "/" ); + out.write( SurfacingConstants.NL ); + } + } + else { + final List domains = protein.getProteinDomains( domain_id ); + if ( domains.size() > 0 ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator_for_output ); + out.write( protein.getProteinId().getId() ); + out.write( separator_for_output ); + out.write( domain_id ); + out.write( separator_for_output ); + for( final Domain domain : domains ) { + if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { + out.write( "/" ); + out.write( domain.getFrom() + "-" + domain.getTo() ); + } + } + out.write( "/" ); + out.write( separator_for_output ); + final List domain_list = new ArrayList(); + for( final Domain domain : protein.getProteinDomains() ) { + if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { + domain_list.add( domain ); + } + } + final Domain domain_ary[] = new Domain[ domain_list.size() ]; + for( int i = 0; i < domain_list.size(); ++i ) { + domain_ary[ i ] = domain_list.get( i ); + } + Arrays.sort( domain_ary, new DomainComparator( true ) ); + out.write( "{" ); + boolean first = true; + for( final Domain domain : domain_ary ) { + if ( first ) { + first = false; + } + else { + out.write( "," ); + } + out.write( domain.getDomainId().toString() ); + out.write( ":" + domain.getFrom() + "-" + domain.getTo() ); + out.write( ":" + domain.getPerDomainEvalue() ); + } + out.write( "}" ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) + || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator_for_output ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) + || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + } + out.flush(); + } + public static void main( final String[] args ) { Set a = new HashSet(); Set b = new HashSet();