import java.io.File;
import java.io.IOException;
+import java.io.Writer;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.sdi.GSDI;
import org.forester.sdi.SDI;
+import org.forester.sdi.SDI.TaxonomyComparisonBase;
import org.forester.sdi.SDIse;
import org.forester.util.CommandLineArguments;
import org.forester.util.ForesterUtil;
public final class gsdi {
- final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true;
- final static private String STRIP_OPTION = "s";
- final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION = "g";
-
- final static private String SDI_OPTION = "b";
- final static private String MOST_PARSIMONIOUS_OPTION = "m";
- final static private String GUESS_FORMAT_OF_SPECIES_TREE = "q";
- final static private String HELP_OPTION_1 = "help";
- final static private String HELP_OPTION_2 = "h";
- final static private String DEFAULT_OUTFILE_SUFFIX = "_gsdi_out.phylo.xml";
+ private enum BASE_ALGORITHM {
+ GSDI, SDI
+ }
+ final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true;
+ final static private String STRIP_OPTION = "s";
+ final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION = "g";
+ final static private String SDI_OPTION = "b";
+ final static private String MOST_PARSIMONIOUS_OPTION = "m";
+ final static private String GUESS_FORMAT_OF_SPECIES_TREE = "q";
+ final static private String HELP_OPTION_1 = "help";
+ final static private String HELP_OPTION_2 = "h";
+ final static private String DEFAULT_OUTFILE_SUFFIX = "_gsdi_out.phylo.xml";
final static private String SUFFIX_FOR_LIST_OF_STIPPED_GENE_TREE_NODES = "_stripped_gene_tree_nodes.txt";
- final static private String SUFFIX_FOR_LOG_FILE = "_gsdi_log.txt";
- final static private String PRG_NAME = "gsdi";
- final static private String PRG_VERSION = "0.901";
- final static private String PRG_DATE = "120608";
- final static private String PRG_DESC = "general speciation duplication inference";
- final static private String E_MAIL = "phylosoft@gmail.com";
- final static private String WWW = "www.phylosoft.org/forester";
+ final static private String LOGFILE_SUFFIX = "_gsdi_log.txt";
+ final static private String PRG_NAME = "gsdi";
+ final static private String PRG_VERSION = "0.901";
+ final static private String PRG_DATE = "120608";
+ final static private String PRG_DESC = "general speciation duplication inference";
+ final static private String E_MAIL = "phylosoft@gmail.com";
+ final static private String WWW = "www.phylosoft.org/forester";
public static void main( final String args[] ) {
- ForesterUtil.printProgramInformation( PRG_NAME,
- PRG_DESC,
- PRG_VERSION,
- PRG_DATE,
- E_MAIL,
- WWW,
- ForesterUtil.getForesterLibraryInformation() );
- CommandLineArguments cla = null;
try {
- cla = new CommandLineArguments( args );
- }
- catch ( final Exception e ) {
- ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
- }
- if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) {
- System.out.println();
- gsdi.print_help();
- System.exit( 0 );
- }
- else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
- System.out.println();
- System.out.println( "Wrong number of arguments." );
- System.out.println();
- gsdi.print_help();
- System.exit( -1 );
+ ForesterUtil.printProgramInformation( PRG_NAME,
+ PRG_DESC,
+ PRG_VERSION,
+ PRG_DATE,
+ E_MAIL,
+ WWW,
+ ForesterUtil.getForesterLibraryInformation() );
+ CommandLineArguments cla = null;
+ try {
+ cla = new CommandLineArguments( args );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+ }
+ if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) {
+ System.out.println();
+ gsdi.print_help();
+ System.exit( 0 );
+ }
+ else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
+ System.out.println();
+ System.out.println( "Wrong number of arguments." );
+ System.out.println();
+ gsdi.print_help();
+ System.exit( -1 );
+ }
+ final List<String> allowed_options = new ArrayList<String>();
+ allowed_options.add( gsdi.STRIP_OPTION );
+ allowed_options.add( gsdi.SDI_OPTION );
+ allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE );
+ allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION );
+ allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
+ final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
+ if ( dissallowed_options.length() > 0 ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
+ }
+ execute( cla );
}
- final List<String> allowed_options = new ArrayList<String>();
- allowed_options.add( gsdi.STRIP_OPTION );
- allowed_options.add( gsdi.SDI_OPTION );
- allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE );
- allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION );
- allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
-
- final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
- if ( dissallowed_options.length() > 0 ) {
- ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() );
}
- boolean use_sdise = false;
+ }
+
+ private static void execute( final CommandLineArguments cla ) throws IOException {
+ BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI;
boolean strip = false;
boolean most_parsimonous_duplication_model = false;
boolean species_tree_in_phyloxml = true;
strip = true;
}
if ( cla.isOptionSet( gsdi.SDI_OPTION ) ) {
- use_sdise = true;
+ base_algorithm = BASE_ALGORITHM.SDI;
}
if ( cla.isOptionSet( gsdi.MOST_PARSIMONIOUS_OPTION ) ) {
- if ( use_sdise ) {
+ if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" );
}
most_parsimonous_duplication_model = true;
species_tree_in_phyloxml = false;
}
if ( cla.isOptionSet( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) {
- if ( use_sdise ) {
+ if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only allow stripping of gene tree with GSDI" );
}
allow_stripping_of_gene_tree = true;
}
-
-
-
Phylogeny species_tree = null;
Phylogeny gene_tree = null;
File gene_tree_file = null;
File species_tree_file = null;
File out_file = null;
+ File log_file = null;
+ Writer log_writer = null;
try {
gene_tree_file = cla.getFile( 0 );
species_tree_file = cla.getFile( 1 );
}
else {
out_file = new File( ForesterUtil.removeSuffix( gene_tree_file.toString() ) + DEFAULT_OUTFILE_SUFFIX );
- //out_file = new File( gsdi.DEFAULT_OUTFILE );
}
+ log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX );
}
catch ( final IllegalArgumentException e ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, "error in command line: " + e.getMessage() );
if ( ForesterUtil.isWritableFile( out_file ) != null ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
}
+ if ( ForesterUtil.isWritableFile( log_file ) != null ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
+ }
+ try {
+ log_writer = ForesterUtil.createBufferedWriter( log_file );
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
+ }
+ try {
+ final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+ gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME,
+ "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() );
+ }
try {
final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
if ( species_tree_in_phyloxml ) {
( ( NHXParser ) p ).setReplaceUnderscores( true );
}
species_tree = factory.create( species_tree_file, p )[ 0 ];
- PhylogenyMethods.transferNodeNameToField( species_tree,
- PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME );
+ final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree );
+ switch ( comp_base ) {
+ case SCIENTIFIC_NAME:
+ PhylogenyMethods
+ .transferNodeNameToField( species_tree,
+ PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1 );
+ break;
+ case CODE:
+ PhylogenyMethods.transferNodeNameToField( species_tree,
+ PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE );
+ break;
+ case ID:
+ PhylogenyMethods.transferNodeNameToField( species_tree,
+ PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID );
+ break;
+ default:
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "unable to determine comparison base" );
+ }
}
}
catch ( final IOException e ) {
ForesterUtil.fatalError( gsdi.PRG_NAME,
"Failed to read species tree from [" + gene_tree_file + "]: " + e.getMessage() );
}
- try {
- final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
- gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
- }
- catch ( final IOException e ) {
- ForesterUtil.fatalError( gsdi.PRG_NAME,
- "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() );
- }
gene_tree.setRooted( true );
species_tree.setRooted( true );
if ( !gene_tree.isCompletelyBinary() ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary" );
}
- if ( use_sdise ) {
+ if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
if ( !species_tree.isCompletelyBinary() ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree (\"" + species_tree_file
- + "\") is not completely binary" );
+ + "\") is not completely binary, use GSDI instead" );
}
}
// For timing.
// Helper.randomizeSpecies( 1, 8192, gene_tree );
// Helper.intervalNumberSpecies( gene_tree, 4096 );
// Helper.numberSpeciesInDescOrder( gene_tree );
+ log_writer.write( PRG_NAME + " " + PRG_VERSION + " " + PRG_DATE );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ log_writer.write( PRG_DESC );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ log_writer.write( PRG_DESC );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ log_writer.write( new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
System.out.println();
System.out.println( "Strip species tree: " + strip );
+ log_writer.write( "Strip species tree: " + strip );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
SDI sdi = null;
final long start_time = new Date().getTime();
try {
- if ( use_sdise ) {
+ if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
System.out.println();
- System.out.println( "Using SDIse algorithm" );
- sdi = new SDIse( gene_tree, species_tree );
+ System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
+ System.out.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree );
+ log_writer.write( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ log_writer.write( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ sdi = new GSDI( gene_tree,
+ species_tree,
+ most_parsimonous_duplication_model,
+ allow_stripping_of_gene_tree );
}
else {
System.out.println();
- System.out.println( "Using GSDI algorithm" );
- System.out.println();
- System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
- System.out.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree );
- sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model, allow_stripping_of_gene_tree );
-
+ System.out.println( "Using SDIse algorithm" );
+ log_writer.write( "Using SDIse algorithm" );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ sdi = new SDIse( gene_tree, species_tree );
}
}
catch ( final Exception e ) {
}
System.out.println();
System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+ log_writer.write( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
try {
final PhylogenyWriter writer = new PhylogenyWriter();
writer.toPhyloXML( out_file, gene_tree, 0 );
System.out.println();
System.out.println( "Successfully wrote resulting gene tree to: " + out_file );
System.out.println();
- if ( use_sdise ) {
+ log_writer.write( "Wrote resulting gene tree to: " + out_file );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ if ( base_algorithm == BASE_ALGORITHM.SDI ) {
sdi.computeMappingCostL();
System.out.println( "Mapping cost : " + sdi.computeMappingCostL() );
+ log_writer.write( "Mapping cost : " + sdi.computeMappingCostL() );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
}
System.out.println( "Number of duplications : " + sdi.getDuplicationsSum() );
- if ( !use_sdise && !most_parsimonous_duplication_model ) {
- System.out.println( "Number of potential duplications: "
- + ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum() );
+ log_writer.write( "Number of duplications : " + sdi.getDuplicationsSum() );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
+ if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) && !most_parsimonous_duplication_model ) {
+ final int duplications = ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum();
+ System.out.println( "Number of potential duplications: " + duplications );
+ log_writer.write( "Number of potential duplications: " + duplications );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
}
- if ( !use_sdise ) {
- System.out.println( "Number of speciations : " + ( ( GSDI ) sdi ).getSpeciationsSum() );
+ if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
+ final int spec = ( ( GSDI ) sdi ).getSpeciationsSum();
+ System.out.println( "Number of speciations : " + spec );
+ log_writer.write( "Number of speciations : " + spec );
+ log_writer.write( ForesterUtil.LINE_SEPARATOR );
}
System.out.println();
- some stat on gene tree:
- filename, name
- number of external nodes, strppided nodes
- some stats on sepcies tree, external nodes,
- filename, name
- internal nodes
- how many of which are polytomies
+ // some stat on gene tree:
+ // filename, name
+ // number of external nodes, strppided nodes
+ // some stats on sepcies tree, external nodes,
+ // filename, name
+ // internal nodes
+ // how many of which are polytomies
//wrote log file to
- if ( allow_stripping_of_gene_tree ) {
- stripped x nodes, y external nodes remain
-
-
- }
+ // if ( allow_stripping_of_gene_tree ) {
+ // stripped x nodes, y external nodes remain
+ // }
}
private static void print_help() {
+ " [-options] <gene tree in phyloXML format> <species tree in phyloXML format> [outfile]" );
System.out.println();
System.out.println( "Options:" );
- System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + );
- System.out.println( " -" + gsdi.STRIP_OPTION + ": to strip the species tree of unneeded species prior to duplication inference" );
+ // System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + );
+ System.out.println( " -" + gsdi.STRIP_OPTION
+ + ": to strip the species tree of unneeded species prior to duplication inference" );
System.out.println( " -" + gsdi.SDI_OPTION + ": to use SDI algorithm instead of GSDI algorithm" );//TODO gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION not allowed
System.out.println( " -" + gsdi.MOST_PARSIMONIOUS_OPTION
+ ": use most parimonious duplication model for GSDI: " );
System.out.println( " assign nodes as speciations which would otherwise be assiged" );
System.out.println( " as unknown because of polytomies in the species tree" );
- System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE + ": to allow species tree in other formats than" );
- System.out.println( " phyloXML (Newick, NHX, Nexus)" );
-
-
+ System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
+ + ": to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" );
System.out.println();
System.out.println( "Species tree:" );
- System.out.println( " In phyloXML format (unless option -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
- + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" );
+ System.out
+ .println( " In phyloXML format (unless option -"
+ + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
+ + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" );
System.out.println();
System.out.println( "Gene tree:" );
System.out.println( " In phyloXM format, with taxonomy and sequence data in appropriate fields" );
System.out.println();
System.out.println( "Example:" );
- System.out.println( "gsdi
- System.out.println();
+ // System.out.println( "gsdi
+ // System.out.println();
System.out.println( "Note -- GSDI algorithm is under development" );
System.out.println();
}
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import org.forester.phylogeny.Phylogeny;
private final boolean _strip_gene_tree;
private int _speciation_or_duplication_events_sum;
private int _speciations_sum;
- private final Set<PhylogenyNode> _stripped_gene_tree_nodes;
+ private final Set<PhylogenyNode> _stripped_gene_tree_nodes;
/**
* Constructor which sets the gene tree and the species tree to be compared.
_speciations_sum = 0;
_most_parsimonious_duplication_model = most_parsimonious_duplication_model;
_transversal_counts = new HashMap<PhylogenyNode, Integer>();
- _duplications_sum = 0;
+ _duplications_sum = 0;
_strip_gene_tree = strip_gene_tree;
- _stripped_gene_tree_nodes = new HashSet<PhylogenyNode>();
+ _stripped_gene_tree_nodes = new HashSet<PhylogenyNode>();
getSpeciesTree().preOrderReId();
linkNodesOfG();
geneTreePostOrderTraversal( getGeneTree().getRoot() );
}
}
+ final void linkNodesOfG2() {
+ final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = createTaxonomyToNodeMap();
+ if ( _strip_gene_tree ) {
+ stripGeneTree( speciestree_ext_nodes );
+ if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
+ throw new IllegalArgumentException( "species tree does not contain any"
+ + " nodes matching species in the gene tree" );
+ }
+ }
+ // Retrieve the reference to the PhylogenyNode with a matching species.
+ for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode g = iter.next();
+ if ( !g.getNodeData().isHasTaxonomy() ) {
+ throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
+ }
+ final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
+ if ( s == null ) {
+ throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
+ + " not present in species tree" );
+ }
+ g.setLink( s );
+ }
+ //////
+ final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
+ final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
+ if ( _strip_gene_tree ) {
+ stripGeneTree( speciestree_ext_nodes );
+ if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
+ throw new IllegalArgumentException( "species tree does not contain any"
+ + " nodes matching species in the gene tree" );
+ }
+ }
+ // Put references to all external nodes of the species tree into a map.
+ // Stringyfied taxonomy is the key, node is the value.
+ for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode s = iter.next();
+ final String tax_str = taxonomyToString( s, tax_comp_base );
+ if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
+ throw new IllegalArgumentException( "taxonomy [" + s + "] is not unique in species phylogeny" );
+ }
+ speciestree_ext_nodes.put( tax_str, s );
+ }
+ // Retrieve the reference to the node with a matching stringyfied taxonomy.
+ for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode g = iter.next();
+ final String tax_str = taxonomyToString( g, tax_comp_base );
+ final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
+ if ( s == null ) {
+ throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
+ + "] not present in species tree" );
+ }
+ g.setLink( s );
+ }
+ }
+
final private HashMap<Taxonomy, PhylogenyNode> createTaxonomyToNodeMap() {
final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = new HashMap<Taxonomy, PhylogenyNode>();
for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) {
}
private final void stripGeneTree( final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes ) {
- // final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
-
+ // final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode g = iter.next();
if ( !g.getNodeData().isHasTaxonomy() ) {
throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
}
- final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
- if ( s == null ) {
+ if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) {
_stripped_gene_tree_nodes.add( g );
}
}
- for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
+ for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
_gene_tree.deleteSubtree( n, true );
-
}
-
}
-
+
+ public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
+ int with_id_count = 0;
+ int with_code_count = 0;
+ int with_sn_count = 0;
+ int max = 0;
+ for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode g = iter.next();
+ if ( g.getNodeData().isHasTaxonomy() ) {
+ final Taxonomy tax = g.getNodeData().getTaxonomy();
+ if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
+ if ( ++with_id_count > max ) {
+ max = with_id_count;
+ }
+ }
+ if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
+ if ( ++with_code_count > max ) {
+ max = with_code_count;
+ }
+ }
+ if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
+ if ( ++with_sn_count > max ) {
+ max = with_sn_count;
+ }
+ }
+ }
+ }
+ if ( max == 0 ) {
+ throw new IllegalArgumentException( "gene tree has no taxonomic data" );
+ }
+ else if ( max == 1 ) {
+ throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
+ }
+ else if ( max == with_sn_count ) {
+ return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
+ }
+ else if ( max == with_id_count ) {
+ return SDI.TaxonomyComparisonBase.ID;
+ }
+ else {
+ return SDI.TaxonomyComparisonBase.CODE;
+ }
+ }
+
public Set<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
return _stripped_gene_tree_nodes;
}