// $Id: // // FORESTER -- software libraries and applications // for evolutionary biology research and applications. // // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com // WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.evoinference.parsimony; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; import java.util.SortedSet; import java.util.TreeSet; import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; import org.forester.evoinference.matrix.character.CharacterStateMatrix; import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.FailedConditionCheckException; import org.forester.util.ForesterUtil; public class FitchParsimony { final static private BinaryStates ABSENT = BinaryStates.ABSENT; final static private GainLossStates GAIN = GainLossStates.GAIN; final static private GainLossStates LOSS = GainLossStates.LOSS; final static private BinaryStates PRESENT = BinaryStates.PRESENT; private static final long RANDOM_NUMBER_SEED_DEFAULT = 21; private static final boolean RANDOMIZE_DEFAULT = false; private static final boolean RETURN_GAIN_LOSS_MATRIX_DEFAULT = false; private static final boolean RETURN_INTERNAL_STATES_DEFAULT = false; final static private GainLossStates UNCHANGED_ABSENT = GainLossStates.UNCHANGED_ABSENT; final static private GainLossStates UNCHANGED_PRESENT = GainLossStates.UNCHANGED_PRESENT; private static final boolean USE_LAST_DEFAULT = false; private int _cost; private CharacterStateMatrix _gain_loss_matrix; private CharacterStateMatrix _internal_states_matrix_after_traceback; private CharacterStateMatrix> _internal_states_matrix_prior_to_traceback; private Random _random_generator; private long _random_number_seed; private boolean _randomize; private boolean _return_gain_loss = false; private boolean _return_internal_states = false; private int _total_gains; private int _total_losses; private int _total_unchanged; private boolean _use_last; private boolean _verbose = false; public FitchParsimony() { init(); } public void execute( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix ) { execute( p, external_node_states_matrix, false ); } public void execute( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix, final boolean verbose ) { if ( !p.isRooted() ) { throw new IllegalArgumentException( "attempt to execute Fitch parsimony on unroored phylogeny" ); } if ( external_node_states_matrix.isEmpty() ) { throw new IllegalArgumentException( "character matrix is empty" ); } if ( external_node_states_matrix.getNumberOfIdentifiers() != p.getNumberOfExternalNodes() ) { throw new IllegalArgumentException( "number of external nodes in phylogeny [" + p.getNumberOfExternalNodes() + "] and number of indentifiers [" + external_node_states_matrix.getNumberOfIdentifiers() + "] in matrix are not equal" ); } setVerbose( verbose ); reset(); if ( isReturnInternalStates() ) { initializeInternalStates( p, external_node_states_matrix ); } if ( isReturnGainLossMatrix() ) { initializeGainLossMatrix( p, external_node_states_matrix ); } final DecimalFormat pf = new java.text.DecimalFormat( "000000" ); if ( isVerbose() ) { System.out.println( "Number of characters: " + external_node_states_matrix.getNumberOfCharacters() ); } for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) { if ( isVerbose() ) { ForesterUtil.updateProgress( character_index, pf ); } executeForOneCharacter( p, getStatesForCharacter( p, external_node_states_matrix, character_index ), getStatesForCharacterForTraceback( p, external_node_states_matrix, character_index ), character_index ); } if ( isVerbose() ) { System.out.println(); } if ( external_node_states_matrix.getState( 0, 0 ) instanceof BinaryStates ) { if ( ( external_node_states_matrix.getNumberOfCharacters() * p.getNumberOfBranches() ) != ( getTotalGains() + getTotalLosses() + getTotalUnchanged() ) ) { throw new FailedConditionCheckException( "this should not have happened: something is deeply wrong with Fitch parsimony implementation" ); } } } public int getCost() { return _cost; } public CharacterStateMatrix getGainLossMatrix() { if ( !isReturnGainLossMatrix() ) { throw new RuntimeException( "creation of gain-loss matrix has not been enabled" ); } return _gain_loss_matrix; } public CharacterStateMatrix getInternalStatesMatrix() { if ( !isReturnInternalStates() ) { throw new RuntimeException( "creation of internal state matrix has not been enabled" ); } return _internal_states_matrix_after_traceback; } /** * Returns a view of the internal states prior to trace-back. * * @return */ public CharacterStateMatrix> getInternalStatesMatrixPriorToTraceback() { if ( !isReturnInternalStates() ) { throw new RuntimeException( "creation of internal state matrix has not been enabled" ); } return _internal_states_matrix_prior_to_traceback; } public int getTotalGains() { return _total_gains; } public int getTotalLosses() { return _total_losses; } public int getTotalUnchanged() { return _total_unchanged; } public boolean isVerbose() { return _verbose; } public void setRandomize( final boolean randomize ) { if ( randomize && isUseLast() ) { throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" ); } _randomize = randomize; } public void setRandomNumberSeed( final long random_number_seed ) { if ( !isRandomize() ) { throw new IllegalArgumentException( "attempt to set random number generator seed without randomization enabled" ); } _random_number_seed = random_number_seed; } public void setReturnGainLossMatrix( final boolean return_gain_loss ) { _return_gain_loss = return_gain_loss; } public void setReturnInternalStates( final boolean return_internal_states ) { _return_internal_states = return_internal_states; } /** * This sets whether to use the first or last state in the sorted * states at the undecided internal nodes. * For randomized choices set randomize to true (and this to false). * * Note. It might be advisable to set this to false * for BinaryStates if absence at the root is preferred * (given the enum BinaryStates sorts in the following order: * ABSENT, UNKNOWN, PRESENT). * * * @param use_last */ public void setUseLast( final boolean use_last ) { if ( use_last && isRandomize() ) { throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" ); } _use_last = use_last; } public void setVerbose( final boolean verbose ) { _verbose = verbose; } private int determineIndex( final SortedSet current_node_states, int i ) { if ( isRandomize() ) { i = getRandomGenerator().nextInt( current_node_states.size() ); } else if ( isUseLast() ) { i = current_node_states.size() - 1; } return i; } private void executeForOneCharacter( final Phylogeny p, final Map> states, final Map traceback_states, final int character_state_column ) { postOrderTraversal( p, states ); preOrderTraversal( p, states, traceback_states, character_state_column ); } private SortedSet getIntersectionOfStatesOfChildNodes( final Map> states, final PhylogenyNode node ) throws AssertionError { final SortedSet states_in_child_nodes = new TreeSet(); for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { final PhylogenyNode node_child = node.getChildNode( i ); if ( !states.containsKey( node_child ) ) { throw new AssertionError( "this should not have happened: node [" + node_child.getName() + "] not found in node state map" ); } if ( i == 0 ) { states_in_child_nodes.addAll( states.get( node_child ) ); } else { states_in_child_nodes.retainAll( states.get( node_child ) ); } } return states_in_child_nodes; } private Random getRandomGenerator() { return _random_generator; } private long getRandomNumberSeed() { return _random_number_seed; } private STATE_TYPE getStateAt( final int i, final SortedSet states ) { final Iterator it = states.iterator(); for( int j = 0; j < i; ++j ) { it.next(); } return it.next(); } private Map> getStatesForCharacter( final Phylogeny p, final CharacterStateMatrix matrix, final int character_index ) { final Map> states = new HashMap>( matrix .getNumberOfIdentifiers() ); for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) { final STATE_TYPE state = matrix.getState( indentifier_index, character_index ); if ( state == null ) { throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index + "] is null" ); } final SortedSet l = new TreeSet(); l.add( state ); states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), l ); } return states; } private Map getStatesForCharacterForTraceback( final Phylogeny p, final CharacterStateMatrix matrix, final int character_index ) { final Map states = new HashMap( matrix.getNumberOfIdentifiers() ); for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) { final STATE_TYPE state = matrix.getState( indentifier_index, character_index ); if ( state == null ) { throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index + "] is null" ); } states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), state ); } return states; } private SortedSet getUnionOfStatesOfChildNodes( final Map> states, final PhylogenyNode node ) throws AssertionError { final SortedSet states_in_child_nodes = new TreeSet(); for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { final PhylogenyNode node_child = node.getChildNode( i ); if ( !states.containsKey( node_child ) ) { throw new AssertionError( "this should not have happened: node [" + node_child.getName() + "] not found in node state map" ); } states_in_child_nodes.addAll( states.get( node_child ) ); } return states_in_child_nodes; } private void increaseCost() { ++_cost; } private void init() { setReturnInternalStates( RETURN_INTERNAL_STATES_DEFAULT ); setReturnGainLossMatrix( RETURN_GAIN_LOSS_MATRIX_DEFAULT ); setRandomize( RANDOMIZE_DEFAULT ); setUseLast( USE_LAST_DEFAULT ); _random_number_seed = RANDOM_NUMBER_SEED_DEFAULT; reset(); } private void initializeGainLossMatrix( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix ) { final List nodes = new ArrayList(); for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) { nodes.add( postorder.next() ); } setGainLossMatrix( new BasicCharacterStateMatrix( nodes.size(), external_node_states_matrix .getNumberOfCharacters() ) ); int identifier_index = 0; for( final PhylogenyNode node : nodes ) { getGainLossMatrix().setIdentifier( identifier_index++, ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node .getName() ); } for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) { getGainLossMatrix().setCharacter( character_index, external_node_states_matrix.getCharacter( character_index ) ); } } private void initializeInternalStates( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix ) { final List internal_nodes = new ArrayList(); for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) { final PhylogenyNode node = postorder.next(); if ( node.isInternal() ) { internal_nodes.add( node ); } } setInternalStatesMatrixPriorToTraceback( new BasicCharacterStateMatrix>( internal_nodes.size(), external_node_states_matrix .getNumberOfCharacters() ) ); setInternalStatesMatrixTraceback( new BasicCharacterStateMatrix( internal_nodes.size(), external_node_states_matrix .getNumberOfCharacters() ) ); int identifier_index = 0; for( final PhylogenyNode node : internal_nodes ) { getInternalStatesMatrix().setIdentifier( identifier_index, ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node .getName() ); getInternalStatesMatrixPriorToTraceback().setIdentifier( identifier_index, ForesterUtil.isEmpty( node.getName() ) ? node .getId() + "" : node.getName() ); ++identifier_index; } for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) { getInternalStatesMatrix().setCharacter( character_index, external_node_states_matrix.getCharacter( character_index ) ); getInternalStatesMatrixPriorToTraceback().setCharacter( character_index, external_node_states_matrix .getCharacter( character_index ) ); } } private boolean isRandomize() { return _randomize; } private boolean isReturnGainLossMatrix() { return _return_gain_loss; } private boolean isReturnInternalStates() { return _return_internal_states; } private boolean isUseLast() { return _use_last; } private void postOrderTraversal( final Phylogeny p, final Map> states ) { for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) { final PhylogenyNode node = postorder.next(); if ( !node.isExternal() ) { SortedSet states_in_children = getIntersectionOfStatesOfChildNodes( states, node ); if ( states_in_children.isEmpty() ) { states_in_children = getUnionOfStatesOfChildNodes( states, node ); } states.put( node, states_in_children ); } } } private void preOrderTraversal( final Phylogeny p, final Map> states, final Map traceback_states, final int character_state_column ) { for( final PhylogenyNodeIterator preorder = p.iteratorPreorder(); preorder.hasNext(); ) { final PhylogenyNode current_node = preorder.next(); final SortedSet current_node_states = states.get( current_node ); STATE_TYPE parent_state = null; if ( current_node.isRoot() ) { int i = 0; i = determineIndex( current_node_states, i ); traceback_states.put( current_node, getStateAt( i, current_node_states ) ); } else { parent_state = traceback_states.get( current_node.getParent() ); if ( current_node_states.contains( parent_state ) ) { traceback_states.put( current_node, parent_state ); } else { increaseCost(); int i = 0; i = determineIndex( current_node_states, i ); traceback_states.put( current_node, getStateAt( i, current_node_states ) ); } } if ( isReturnInternalStates() ) { if ( !current_node.isExternal() ) { setInternalNodeStatePriorToTraceback( states, character_state_column, current_node ); setInternalNodeState( traceback_states, character_state_column, current_node ); } } if ( isReturnGainLossMatrix() && !current_node.isRoot() ) { if ( !( parent_state instanceof BinaryStates ) ) { throw new RuntimeException( "attempt to create gain loss matrix for not binary states" ); } final BinaryStates parent_binary_state = ( BinaryStates ) parent_state; final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node ); if ( ( parent_binary_state == PRESENT ) && ( current_binary_state == ABSENT ) ) { ++_total_losses; setGainLossState( character_state_column, current_node, LOSS ); } else if ( ( ( parent_binary_state == ABSENT ) || ( parent_binary_state == null ) ) && ( current_binary_state == PRESENT ) ) { ++_total_gains; setGainLossState( character_state_column, current_node, GAIN ); } else { ++_total_unchanged; if ( current_binary_state == PRESENT ) { setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT ); } else if ( current_binary_state == ABSENT ) { setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT ); } } } else if ( isReturnGainLossMatrix() && current_node.isRoot() ) { final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node ); ++_total_unchanged; //new if ( current_binary_state == PRESENT ) {//new setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT );//new }//new else if ( current_binary_state == ABSENT ) {//new setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT );//new }//new // setGainLossState( character_state_column, current_node, UNKNOWN_GAIN_LOSS ); } } } private void reset() { setCost( 0 ); setTotalLosses( 0 ); setTotalGains( 0 ); setTotalUnchanged( 0 ); setRandomGenerator( new Random( getRandomNumberSeed() ) ); } private void setCost( final int cost ) { _cost = cost; } private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) { _gain_loss_matrix = gain_loss_matrix; } private void setGainLossState( final int character_state_column, final PhylogenyNode node, final GainLossStates state ) { getGainLossMatrix().setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(), character_state_column, state ); } private void setInternalNodeState( final Map states, final int character_state_column, final PhylogenyNode node ) { getInternalStatesMatrix() .setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(), character_state_column, states.get( node ) ); } private void setInternalNodeStatePriorToTraceback( final Map> states, final int character_state_column, final PhylogenyNode node ) { getInternalStatesMatrixPriorToTraceback() .setState( ForesterUtil.isEmpty( node.getName() ) ? String.valueOf( node.getId() ) : node.getName(), character_state_column, toListSorted( states.get( node ) ) ); } private void setInternalStatesMatrixPriorToTraceback( final CharacterStateMatrix> internal_states_matrix_prior_to_traceback ) { _internal_states_matrix_prior_to_traceback = internal_states_matrix_prior_to_traceback; } private void setInternalStatesMatrixTraceback( final CharacterStateMatrix internal_states_matrix_after_traceback ) { _internal_states_matrix_after_traceback = internal_states_matrix_after_traceback; } private void setRandomGenerator( final Random random_generator ) { _random_generator = random_generator; } private void setTotalGains( final int total_gains ) { _total_gains = total_gains; } private void setTotalLosses( final int total_losses ) { _total_losses = total_losses; } private void setTotalUnchanged( final int total_unchanged ) { _total_unchanged = total_unchanged; } private List toListSorted( final SortedSet states ) { final List l = new ArrayList( states.size() ); for( final STATE_TYPE state : states ) { l.add( state ); } return l; } }