pacai.core.mdp
This file provides the core infrastructure for Markov Decision Processes (MDPs).
"""
This file provides the core infrastructure for Markov Decision Processes (MDPs).
"""

import abc
import math
import typing

import edq.util.json

import pacai.core.action
import pacai.core.board
import pacai.core.gamestate
import pacai.util.comparable

ACTION_EXIT: pacai.core.action.Action = pacai.core.action.Action('exit')
""" A new action for exiting the MDP (used to reach the true terminal state). """

TERMINAL_POSITION: pacai.core.board.Position = pacai.core.board.Position(-1, -1)
""" A special (impossible) position representing the terminal state. """

class MDPState(edq.util.json.DictConverter):
    """
    A state or "node" in an MDP.
    """

    @abc.abstractmethod
    def is_terminal(self) -> bool:
        """ Whether or not this state is the terminal state. """

class MDPStatePosition(MDPState):
    """
    An MDP state that relies solely on position.

    Because this class is fairly high-traffic (and MDPStateBoard can be large),
    this class will prefer optimization over generalization.
    """

    def __init__(self,
            position: pacai.core.board.Position | dict[str, typing.Any] | None = None,
            **kwargs: typing.Any) -> None:
        if (position is None):
            raise ValueError("Cannot create a MDPStatePosition without a position.")

        if (isinstance(position, dict)):
            position = pacai.core.board.Position.from_dict(position)

        if (not isinstance(position, pacai.core.board.Position)):
            raise ValueError(f"Cannot create a MDPStatePosition with a non-position type: {type(position)}.")

        self.position: pacai.core.board.Position = position
        """ The board position of this MDP state. """

    def is_terminal(self) -> bool:
        return (self.position == TERMINAL_POSITION)

    def __lt__(self, other: 'MDPStatePosition') -> bool:  # type: ignore[override]
        return (self.position < other.position)

    def __eq__(self, other: object) -> bool:
        if (not isinstance(other, MDPStatePosition)):
            return False

        return (self.position == other.position)

    def __hash__(self) -> int:
        return self.position._hash

    def __str__(self) -> str:
        return str(self.position)

    def __repr__(self) -> str:
        return str(self)

    def to_dict(self) -> dict[str, typing.Any]:
        return {
            'position': self.position.to_dict(),
        }

    @classmethod
    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
        return MDPStatePosition(data['position'])

class MDPStateBoard(MDPStatePosition):
    """
    An MDP state that uses an entire board.
    Technically, this only uses the non-wall markers for a board.

    Using this will be quite slow and is generally not recommended for larger problems.
    Because this class is fairly high-traffic (and the board data can be large),
    this class will prefer optimization over generalization.
    """

    def __init__(self,
            board: pacai.core.board.Board | None = None,
            game_state: pacai.core.gamestate.GameState | None = None,
            _board_string: str | None = None,
            **kwargs: typing.Any) -> None:
        super().__init__(**kwargs)

        if (_board_string is None):
            if ((board is None) and (game_state is not None)):
                board = game_state.board

            if (board is None):
                raise ValueError("Cannot create a MDPStateBoard without a board.")

            _board_string = f"{self.position}::{board.get_nonwall_string()}"

        self._board_string: str = _board_string
        """
        The board represented as a string.
        Converting the board to a string makes future comparisons easier.
        """

    def __lt__(self, other: object) -> bool:
        if (not isinstance(other, MDPStateBoard)):
            return False

        return self._board_string < other._board_string

    def __eq__(self, other: object) -> bool:
        if (not isinstance(other, MDPStateBoard)):
            return False

        return self._board_string == other._board_string

    def __hash__(self) -> int:
        return hash(self._board_string)

    def __str__(self) -> str:
        return super().__str__() + "::" + str(hash(self))

    def __repr__(self) -> str:
        return str(self)

    def to_dict(self) -> dict[str, typing.Any]:
        return {
            'position': self.position.to_dict(),
            '_board_string': self._board_string,
        }

    @classmethod
    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
        return MDPStateBoard(position = data['position'], _board_string = data['_board_string'])

StateType = typing.TypeVar('StateType', bound = MDPState)  # pylint: disable=invalid-name

class Transition(typing.Generic[StateType]):
    """
    A possible result of taking some action in an MDP.
    """

    def __init__(self,
            state: StateType,
            action: pacai.core.action.Action,
            probability: float,
            reward: float,
            **kwargs: typing.Any) -> None:
        self.state = state
        """ The MDP state reached by this transition. """

        self.action = action
        """ The action taken to reach the given state. """

        self.probability = probability
        """ The probability that this transition will be taken. """

        self.reward = reward
        """ The reward for taking this transition. """

    def update(self, other: 'Transition', check: bool = True) -> None:
        """
        Add the probability from the other transition into this one.
        If check is true, then the states and rewards for these transitions must match.
        """

        if (check and (self.state != other.state)):
            raise ValueError(f"State of merging transitions does not match. Expected: '{self.state}', Found: '{other.state}'.")

        if (check and (not math.isclose(self.reward, other.reward))):
            raise ValueError(f"Reward of merging transitions does not match. Expected: '{self.reward}', Found: '{other.reward}'.")

        self.probability += other.probability

class MarkovDecisionProcess(typing.Generic[StateType], edq.util.json.DictConverter):
    """
    A class that implements a Markov Decision Process (MDP).

    See: https://en.wikipedia.org/wiki/Markov_decision_process .
    """

    def game_start(self, initial_game_state: pacai.core.gamestate.GameState) -> None:
        """
        Inform the MDP about the game's start.
        This is the MDP's first chance to see the game/board and initialize the appropriate data.
        """

    @abc.abstractmethod
    def get_starting_state(self) -> StateType:
        """ Return the starting state of this MDP. """

    @abc.abstractmethod
    def get_states(self) -> list[StateType]:
        """ Return a list of all states in this MDP. """

    @abc.abstractmethod
    def is_terminal_state(self, state: StateType) -> bool:
        """
        Returns true if the given state is a terminal state.
        By convention, a terminal state has zero future rewards.
        Sometimes the terminal state(s) may have no possible actions.
        It is also common to think of the terminal state as having
        a self-loop action 'pass' with zero reward; the formulations are equivalent.
        """

    @abc.abstractmethod
    def get_possible_actions(self, state: StateType) -> list[pacai.core.action.Action]:
        """ Return the possible actions from the given MDP state. """

    @abc.abstractmethod
    def get_transitions(self, state: StateType, action: pacai.core.action.Action) -> list[Transition[StateType]]:
        """
        Get a list of the possible transitions from the given state with the given action.
        A transition consists of the target state, the probability that the transition is taken,
        and the reward for taking that transition.

        All transition probabilities should add up to 1.0.

        Note that in some methods like Q-Learning and reinforcement learning,
        we do not know these probabilities nor do we directly model them.
        """

    def to_dict(self) -> dict[str, typing.Any]:
        return vars(self).copy()

    @classmethod
    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
        return cls(**data)
ACTION_EXIT: pacai.core.action.Action = pacai.core.action.Action('exit')
A new action for exiting the MDP (used to reach the true terminal state).
TERMINAL_POSITION: pacai.core.board.Position = pacai.core.board.Position(-1, -1)
A special (impossible) position representing the terminal state.
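For example, a grid MDP built on this module can route every end-of-game square through ACTION_EXIT into a single terminal state constructed from TERMINAL_POSITION. A minimal sketch (the 10.0 reward is an arbitrary placeholder):

import pacai.core.mdp

# The shared terminal state is just an MDPStatePosition at the sentinel position.
terminal_state = pacai.core.mdp.MDPStatePosition(pacai.core.mdp.TERMINAL_POSITION)
print(terminal_state.is_terminal())  # True

# From an "exit" square, the only sensible action is ACTION_EXIT,
# with a single deterministic transition into the terminal state.
exit_transition = pacai.core.mdp.Transition(
    state = terminal_state,
    action = pacai.core.mdp.ACTION_EXIT,
    probability = 1.0,
    reward = 10.0)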
class MDPState(edq.util.json.DictConverter):
A state or "node" in an MDP.
@abc.abstractmethod
def is_terminal(self) -> bool:
Whether or not this state is the terminal state.
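A concrete subclass only has to supply is_terminal(), plus whatever identity, ordering, and hashing behavior it needs to serve as a dictionary key. A hypothetical sketch, not part of pacai:

import typing

import pacai.core.mdp

class MDPStateLabel(pacai.core.mdp.MDPState):
    """ A hypothetical MDP state identified by a plain string label. """

    def __init__(self, label: str, **kwargs: typing.Any) -> None:
        self.label = label

    def is_terminal(self) -> bool:
        # Treat one well-known label as the terminal state.
        return (self.label == 'TERMINAL')

    def __eq__(self, other: object) -> bool:
        return (isinstance(other, MDPStateLabel) and (self.label == other.label))

    def __hash__(self) -> int:
        return hash(self.label)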
class MDPStatePosition(MDPState):
An MDP state that relies solely on position.
Because this class is fairly high-traffic (and MDPStateBoard can be large), this class will prefer optimization over generalization.
def __init__(self, position: pacai.core.board.Position | dict[str, typing.Any] | None = None, **kwargs: typing.Any) -> None:
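A short usage sketch (the (1, 2) coordinates are arbitrary, and it is assumed that two pacai.core.board.Position objects with the same coordinates compare equal and hash alike):

import pacai.core.board
import pacai.core.mdp

state = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(1, 2))
print(state.is_terminal())  # False, since (1, 2) is not TERMINAL_POSITION.

# States compare and hash by position, so they work as dictionary keys,
# e.g. for a value table during value iteration.
values = {state: 0.0}
same_square = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(1, 2))
print(values[same_square])  # 0.0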
def to_dict(self) -> dict[str, typing.Any]:
Return a dict that can be used to represent this object. If the dict is passed to from_dict(), an identical object should be reconstructed.
A general (but inefficient) implementation is provided by default.
@classmethod
def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
Return an instance of this subclass created using the given dict. If the dict came from to_dict(), the returned object should be identical to the original.
A general (but inefficient) implementation is provided by default.
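For instance, a round trip through to_dict() and from_dict() should reproduce an equal state, which is what makes these states safe to serialize alongside the rest of a game record. A small sketch with an arbitrary position:

import pacai.core.board
import pacai.core.mdp

original = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(1, 2))

data = original.to_dict()  # {'position': <the position as a dict>}
restored = pacai.core.mdp.MDPStatePosition.from_dict(data)

print(restored == original)  # True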
class MDPStateBoard(MDPStatePosition):
An MDP state that uses an entire board. Technically, this only uses the non-wall markers for a board.
Using this will be quite slow and is generally not recommended for larger problems. Because this class is fairly high-traffic (and the board data can be large), this class will prefer optimization over generalization.
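A sketch of building one of these states inside code that already has a game in progress; game_state (a pacai.core.gamestate.GameState) and agent_position (a pacai.core.board.Position) are assumed to come from the surrounding game code:

import pacai.core.mdp

# The position and the board's non-wall markers both contribute to the state's identity.
state = pacai.core.mdp.MDPStateBoard(position = agent_position, game_state = game_state)

# Two states only match when the remaining food, capsules, etc. are identical.
another = pacai.core.mdp.MDPStateBoard(position = agent_position, game_state = game_state)
print(state == another)  # True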
def __init__(self,
        board: pacai.core.board.Board | None = None,
        game_state: pacai.core.gamestate.GameState | None = None,
        _board_string: str | None = None,
        **kwargs: typing.Any) -> None:
def to_dict(self) -> dict[str, typing.Any]:
Return a dict that can be used to represent this object. If the dict is passed to from_dict(), an identical object should be reconstructed.
A general (but inefficient) implementation is provided by default.
@classmethod
def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
Return an instance of this subclass created using the given dict. If the dict came from to_dict(), the returned object should be identical to the original.
A general (but inefficient) implementation is provided by default.
class Transition(typing.Generic[StateType]):
A possible result of taking some action in an MDP.
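For example, a noisy 'north' move in a grid MDP might fan out into several transitions whose probabilities sum to 1.0. The action name, positions, and numbers below are illustrative, not part of this module:

import math

import pacai.core.action
import pacai.core.board
import pacai.core.mdp

north = pacai.core.action.Action('north')  # Hypothetical action name.

intended = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(1, 2))
slipped = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(2, 3))

transitions = [
    pacai.core.mdp.Transition(state = intended, action = north, probability = 0.8, reward = 0.0),
    pacai.core.mdp.Transition(state = slipped, action = north, probability = 0.2, reward = 0.0),
]

# The distribution over outcomes should be complete.
assert math.isclose(sum(transition.probability for transition in transitions), 1.0)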
def __init__(self,
        state: StateType,
        action: pacai.core.action.Action,
        probability: float,
        reward: float,
        **kwargs: typing.Any) -> None:
def update(self, other: 'Transition', check: bool = True) -> None:
Add the probability from the other transition into this one. If check is true, then the states and rewards for these transitions must match.
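This is handy when several low-level outcomes collapse onto the same MDP state and should be reported as one transition. A small sketch:

import pacai.core.board
import pacai.core.mdp

state = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(1, 2))

first = pacai.core.mdp.Transition(state = state, action = pacai.core.mdp.ACTION_EXIT,
    probability = 0.25, reward = 1.0)
second = pacai.core.mdp.Transition(state = state, action = pacai.core.mdp.ACTION_EXIT,
    probability = 0.50, reward = 1.0)

# Same state and same reward, so the check passes and the probabilities are summed.
first.update(second)
print(first.probability)  # 0.75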
class MarkovDecisionProcess(typing.Generic[StateType], edq.util.json.DictConverter):
A class that implements a Markov Decision Process (MDP).
See: https://en.wikipedia.org/wiki/Markov_decision_process .
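As a rough sketch of the contract (not a class that ships with pacai), here is a tiny two-state MDP: a single square whose only action is ACTION_EXIT, which deterministically pays a reward and lands in the terminal state:

import pacai.core.action
import pacai.core.board
import pacai.core.mdp

class TinyExitMDP(pacai.core.mdp.MarkovDecisionProcess[pacai.core.mdp.MDPStatePosition]):
    """ A hypothetical MDP with one non-terminal square and an exit action. """

    def __init__(self, exit_reward: float = 1.0) -> None:
        self._start = pacai.core.mdp.MDPStatePosition(pacai.core.board.Position(0, 0))
        self._terminal = pacai.core.mdp.MDPStatePosition(pacai.core.mdp.TERMINAL_POSITION)
        self._exit_reward = exit_reward

    def get_starting_state(self) -> pacai.core.mdp.MDPStatePosition:
        return self._start

    def get_states(self) -> list[pacai.core.mdp.MDPStatePosition]:
        return [self._start, self._terminal]

    def is_terminal_state(self, state: pacai.core.mdp.MDPStatePosition) -> bool:
        return state.is_terminal()

    def get_possible_actions(self, state: pacai.core.mdp.MDPStatePosition) -> list[pacai.core.action.Action]:
        if (self.is_terminal_state(state)):
            return []

        return [pacai.core.mdp.ACTION_EXIT]

    def get_transitions(self,
            state: pacai.core.mdp.MDPStatePosition,
            action: pacai.core.action.Action) -> list[pacai.core.mdp.Transition[pacai.core.mdp.MDPStatePosition]]:
        # The exit action deterministically pays out and ends in the terminal state.
        return [pacai.core.mdp.Transition(self._terminal, action, 1.0, self._exit_reward)]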
def game_start(self, initial_game_state: pacai.core.gamestate.GameState) -> None:
Inform the MDP about the game's start. This is the MDP's first chance to see the game/board and initialize the appropriate data.
@abc.abstractmethod
def get_starting_state(self) -> StateType:
Return the starting state of this MDP.
@abc.abstractmethod
def get_states(self) -> list[StateType]:
Return a list of all states in this MDP.
@abc.abstractmethod
def is_terminal_state(self, state: StateType) -> bool:
Returns true if the given state is a terminal state. By convention, a terminal state has zero future rewards. Sometimes the terminal state(s) may have no possible actions. It is also common to think of the terminal state as having a self-loop action 'pass' with zero reward; the formulations are equivalent.
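For instance, under the self-loop reading, a value backup at the terminal state is V = 0 + discount * V, whose only fixed point (for any discount below 1) is zero, which matches the zero-future-rewards convention. A throwaway numerical check:

discount = 0.9
value = 123.0  # Any starting guess.

# Repeated backups of the zero-reward 'pass' self-loop drive the value to zero.
for _ in range(1000):
    value = 0.0 + (discount * value)

print(round(value, 6))  # 0.0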
@abc.abstractmethod
def get_possible_actions(self, state: StateType) -> list[pacai.core.action.Action]:
Return the possible actions from the given MDP state.
@abc.abstractmethod
def get_transitions(self, state: StateType, action: pacai.core.action.Action) -> list[Transition[StateType]]:
Get a list of the possible transitions from the given state with the given action. A transition consists of the target state, the probability that the transition is taken, and the reward for taking that transition.
All transition probabilities should add up to 1.0.
Note that model-free methods such as Q-learning (and reinforcement learning more generally) do not know these probabilities, nor do they model them directly.
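When the model is available, these transitions are exactly what planning methods such as value iteration consume. A sketch of a one-step backup, Q(s, a) = sum over transitions of probability * (reward + discount * V(next state)); the q_value helper below is written for illustration and is not part of this module:

import pacai.core.action
import pacai.core.mdp

def q_value(mdp: pacai.core.mdp.MarkovDecisionProcess,
        state: pacai.core.mdp.MDPState,
        action: pacai.core.action.Action,
        values: dict[pacai.core.mdp.MDPState, float],
        discount: float = 0.9) -> float:
    """ Expected discounted return of taking the action in the state, given current state values. """

    total = 0.0
    for transition in mdp.get_transitions(state, action):
        total += transition.probability * (transition.reward + (discount * values.get(transition.state, 0.0)))

    return total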