pacai.core.mdp

This file provides the core infrastructure for Markov Decision Processes (MDPs).

  1"""
  2This file provides the core infrastructure for Markov Decision Processes (MDPs).
  3"""
  4
  5import abc
  6import math
  7import typing
  8
  9import edq.util.json
 10
 11import pacai.core.action
 12import pacai.core.board
 13import pacai.core.gamestate
 14import pacai.util.comparable
 15
 16ACTION_EXIT: pacai.core.action.Action = pacai.core.action.Action('exit')
 17""" A new action for exiting the MDP (used to reach the true terminal state). """
 18
 19TERMINAL_POSITION: pacai.core.board.Position = pacai.core.board.Position(-1, -1)
 20""" A special (impossible) position representing the terminal state. """
 21
 22class MDPState(edq.util.json.DictConverter):
 23    """
 24    A state or "node" in an MDP.
 25    """
 26
 27    @abc.abstractmethod
 28    def is_terminal(self) -> bool:
 29        """ Whether or not this state is the terminal state. """
 30
 31class MDPStatePosition(MDPState):
 32    """
 33    An MDP state that relies solely on position.
 34
 35    Because this class is fairly high-traffic (and MDPStateBoard can be large),
 36    this class will prefer optimization over generalization.
 37    """
 38
 39    def __init__(self,
 40            position: pacai.core.board.Position | dict[str, typing.Any] | None = None,
 41            **kwargs: typing.Any) -> None:
 42        if (position is None):
 43            raise ValueError("Cannot create a MDPStatePosition without a position.")
 44
 45        if (isinstance(position, dict)):
 46            position = pacai.core.board.Position.from_dict(position)
 47
 48        if (not isinstance(position, pacai.core.board.Position)):
 49            raise ValueError(f"Cannot create a MDPStatePosition with a non-position type: {type(position)}.")
 50
 51        self.position: pacai.core.board.Position = position
 52        """ The board position of this MDP state. """
 53
 54    def is_terminal(self) -> bool:
 55        return (self.position == TERMINAL_POSITION)
 56
 57    def __lt__(self, other: 'MDPStatePosition') -> bool:  # type: ignore[override]
 58        return (self.position < other.position)
 59
 60    def __eq__(self, other: object) -> bool:
 61        if (not isinstance(other, MDPStatePosition)):
 62            return False
 63
 64        return (self.position == other.position)
 65
 66    def __hash__(self) -> int:
 67        return self.position._hash
 68
 69    def __str__(self) -> str:
 70        return str(self.position)
 71
 72    def __repr__(self) -> str:
 73        return str(self)
 74
 75    def to_dict(self) -> dict[str, typing.Any]:
 76        return {
 77            'position': self.position.to_dict(),
 78        }
 79
 80    @classmethod
 81    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
 82        return MDPStatePosition(data['position'])
 83
 84class MDPStateBoard(MDPStatePosition):
 85    """
 86    An MDP state that uses an entire board.
 87    Technically, this only uses the non-wall markers for a board.
 88
 89    Using this will be quite slow and is generally not recommended for larger problems.
 90    Because this class is fairly high-traffic (and the board data can be large),
 91    this class will prefer optimization over generalization.
 92    """
 93
 94    def __init__(self,
 95            board: pacai.core.board.Board | None = None,
 96            game_state: pacai.core.gamestate.GameState | None = None,
 97            _board_string: str | None = None,
 98            **kwargs: typing.Any) -> None:
 99        super().__init__(**kwargs)
100
101        if (_board_string is None):
102            if ((board is None) and (game_state is not None)):
103                board = game_state.board
104
105            if (board is None):
106                raise ValueError("Cannot create a MDPStateBoard without a board.")
107
108            _board_string = f"{self.position}::{board.get_nonwall_string()}"
109
110        self._board_string: str = _board_string
111        """
112        The board represented as a string.
113        Converting the board to a string makes future comparisons easier.
114        """
115
116    def __lt__(self, other: object) -> bool:
117        if (not isinstance(other, MDPStateBoard)):
118            return False
119
120        return self._board_string < other._board_string
121
122    def __eq__(self, other: object) -> bool:
123        if (not isinstance(other, MDPStateBoard)):
124            return False
125
126        return self._board_string == other._board_string
127
128    def __hash__(self) -> int:
129        return hash(self._board_string)
130
131    def __str__(self) -> str:
132        return super().__str__() + "::" + str(hash(self))
133
134    def __repr__(self) -> str:
135        return str(self)
136
137    def to_dict(self) -> dict[str, typing.Any]:
138        return {
139            'position': self.position.to_dict(),
140            '_board_string': self._board_string,
141        }
142
143    @classmethod
144    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
145        return MDPStateBoard(position = data['position'], _board_string = data['_board_string'])
146
147StateType = typing.TypeVar('StateType', bound = MDPState)  # pylint: disable=invalid-name
148
149class Transition(typing.Generic[StateType]):
150    """
151    A possible result of taking some action in an MDP.
152    """
153
154    def __init__(self,
155            state: StateType,
156            action: pacai.core.action.Action,
157            probability: float,
158            reward: float,
159            **kwargs: typing.Any) -> None:
160        self.state = state
161        """ The MDP state reached by this transition. """
162
163        self.action = action
164        """ The action taken to reach the given state. """
165
166        self.probability = probability
167        """ The probability that this transition will be taken. """
168
169        self.reward = reward
170        """ The reward for taking this transition. """
171
172    def update(self, other: 'Transition', check: bool = True) -> None:
173        """
174        Add the probability from the other transition into this one.
175        If check is true, then the states and rewards for these transitions must match.
176        """
177
178        if (check and (self.state != other.state)):
179            raise ValueError(f"State of merging transitions does not match. Expected: '{self.state}', Found: '{other.state}'.")
180
181        if (check and (not math.isclose(self.reward, other.reward))):
182            raise ValueError(f"Reward of merging transitions does not match. Expected: '{self.reward}', Found: '{other.reward}'.")
183
184        self.probability += other.probability
185
186class MarkovDecisionProcess(typing.Generic[StateType], edq.util.json.DictConverter):
187    """
188    A class that implements a Markov Decision Process (MDP).
189
190    See: https://en.wikipedia.org/wiki/Markov_decision_process .
191    """
192
193    def game_start(self, initial_game_state: pacai.core.gamestate.GameState) -> None:
194        """
195        Inform the MDP about the game's start.
196        This is the MDP's first chance to see the game/board and initialize the appropriate data.
197        """
198
199    @abc.abstractmethod
200    def get_starting_state(self) -> StateType:
201        """ Return the starting state of this MDP. """
202
203    @abc.abstractmethod
204    def get_states(self) -> list[StateType]:
205        """ Return a list of all states in this MDP. """
206
207    @abc.abstractmethod
208    def is_terminal_state(self, state: StateType) -> bool:
209        """
210        Returns true if the given state is a terminal state.
211        By convention, a terminal state has zero future rewards.
212        Sometimes the terminal state(s) may have no possible actions.
213        It is also common to think of the terminal state as having
214        a self-loop action 'pass' with zero reward; the formulations are equivalent.
215        """
216
217    @abc.abstractmethod
218    def get_possible_actions(self, state: StateType) -> list[pacai.core.action.Action]:
219        """ Return the possible actions from the given MDP state. """
220
221    @abc.abstractmethod
222    def get_transitions(self, state: StateType, action: pacai.core.action.Action) -> list[Transition[StateType]]:
223        """
224        Get a list of the possible transitions from the given state with the given action.
225        A transition consists of the target state, the probability that the transition is taken,
226        and the reward for taking that transition.
227
228        All transition probabilities should add up to 1.0.
229
230        Note that in some methods like Q-Learning and reinforcement learning,
231        we do not know these probabilities nor do we directly model them.
232        """
233
234    def to_dict(self) -> dict[str, typing.Any]:
235        return vars(self).copy()
236
237    @classmethod
238    def from_dict(cls, data: dict[str, typing.Any]) -> typing.Any:
239        return cls(**data)
ACTION_EXIT: pacai.core.action.Action = pacai.core.action.Action('exit')

A new action for exiting the MDP (used to reach the true terminal state).

TERMINAL_POSITION: pacai.core.board.Position = pacai.core.board.Position(-1, -1)

A special (impossible) position representing the terminal state.
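
These two sentinels work together: ACTION_EXIT is the action that leads into the terminal state, and TERMINAL_POSITION marks that state. A minimal sketch of checking terminality (assuming the module is importable as pacai.core.mdp):

    import pacai.core.mdp

    # A state parked at the sentinel position reports itself as terminal.
    state = pacai.core.mdp.MDPStatePosition(position = pacai.core.mdp.TERMINAL_POSITION)
    print(state.is_terminal())  # True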

class MDPState(edq.util.json.DictConverter):

A state or "node" in an MDP.

@abc.abstractmethod
def is_terminal(self) -> bool:

Whether or not this state is the terminal state.

class MDPStatePosition(MDPState):

An MDP state that relies solely on position.

Because this class is fairly high-traffic (and MDPStateBoard can be large), this class will prefer optimization over generalization.
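
Since equality and hashing delegate to the underlying position, states built from equal positions are interchangeable, e.g. as dictionary keys. A small sketch (assuming Position is built from two integer coordinates, as with TERMINAL_POSITION above, and caches a consistent hash in _hash):

    import pacai.core.board
    import pacai.core.mdp

    a = pacai.core.mdp.MDPStatePosition(position = pacai.core.board.Position(1, 2))
    b = pacai.core.mdp.MDPStatePosition(position = pacai.core.board.Position(1, 2))

    print(a == b)  # True: equality is by position.

    values = {a: 0.0}
    values[b] = 1.0     # Overwrites the entry for `a`, since the states hash and compare equal.
    print(len(values))  # 1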

MDPStatePosition(position: pacai.core.board.Position | dict[str, typing.Any] | None = None, **kwargs: Any)

position

The board position of this MDP state.

def is_terminal(self) -> bool:

Whether or not this state is the terminal state.

def to_dict(self) -> dict[str, typing.Any]:

Return a dict that can be used to represent this object. If the dict is passed to from_dict(), an identical object should be reconstructed.

A general (but inefficient) implementation is provided by default.

@classmethod
def from_dict(cls, data: dict[str, typing.Any]) -> Any:

Return an instance of this subclass created using the given dict. If the dict came from to_dict(), the returned object should be identical to the original.

A general (but inefficient) implementation is provided by default.
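
A round-trip sketch for this pair of methods (hypothetical coordinates; from_dict() accepts the nested position dict because the constructor converts dicts back into Position objects):

    import pacai.core.board
    import pacai.core.mdp

    state = pacai.core.mdp.MDPStatePosition(position = pacai.core.board.Position(3, 4))

    data = state.to_dict()  # {'position': <dict form of the position>}
    clone = pacai.core.mdp.MDPStatePosition.from_dict(data)

    print(clone == state)  # True: the reconstructed state is identical.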

class MDPStateBoard(MDPStatePosition):

An MDP state that uses an entire board. Technically, this only uses the non-wall markers for a board.

Using this will be quite slow and is generally not recommended for larger problems. Because this class is fairly high-traffic (and the board data can be large), this class will prefer optimization over generalization.
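
A sketch of the two public construction paths (agent_position, board, and game_state are hypothetical placeholders; a position is still required, since it is forwarded to MDPStatePosition):

    # From an explicit board:
    state = pacai.core.mdp.MDPStateBoard(position = agent_position, board = board)

    # Or from a full game state (the board is pulled out of it):
    state = pacai.core.mdp.MDPStateBoard(position = agent_position, game_state = game_state)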

MDPStateBoard(board: pacai.core.board.Board | None = None, game_state: pacai.core.gamestate.GameState | None = None, _board_string: str | None = None, **kwargs: Any)

def to_dict(self) -> dict[str, typing.Any]:

Return a dict that can be used to represent this object. If the dict is passed to from_dict(), an identical object should be reconstructed.

A general (but inefficient) implementation is provided by default.

@classmethod
def from_dict(cls, data: dict[str, typing.Any]) -> Any:

Return an instance of this subclass created using the given dict. If the dict came from to_dict(), the returned object should be identical to the original.

A general (but inefficient) implementation is provided by default.

class Transition(typing.Generic[~StateType]):

A possible result of taking some action in an MDP.

Transition(state: ~StateType, action: pacai.core.action.Action, probability: float, reward: float, **kwargs: Any)

state

The MDP state reached by this transition.

action

The action taken to reach the given state.

probability

The probability that this transition will be taken.

reward

The reward for taking this transition.

def update(self, other: Transition, check: bool = True) -> None:

Add the probability from the other transition into this one. If check is true, then the states and rewards for these transitions must match.
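
update() is meant for collapsing duplicate outcomes: when two ways of taking an action land in the same state with the same reward, their probabilities are summed into a single transition. A sketch (the action name 'north' is hypothetical):

    import pacai.core.action
    import pacai.core.board
    import pacai.core.mdp

    s = pacai.core.mdp.MDPStatePosition(position = pacai.core.board.Position(0, 0))
    move = pacai.core.action.Action('north')

    first = pacai.core.mdp.Transition(s, move, probability = 0.8, reward = -1.0)
    second = pacai.core.mdp.Transition(s, move, probability = 0.1, reward = -1.0)

    first.update(second)      # States and rewards match, so the probabilities are summed.
    print(first.probability)  # 0.9 (up to floating-point rounding)

With mismatched states or rewards (and check left as True), update() raises a ValueError instead of silently merging.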

class MarkovDecisionProcess(typing.Generic[~StateType], edq.util.json.DictConverter):

A class that implements a Markov Decision Process (MDP).

See: https://en.wikipedia.org/wiki/Markov_decision_process .
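
This abstract interface is everything a dynamic-programming solver needs. As a sketch of intended use (not part of this module), one synchronous sweep of value iteration over a concrete MarkovDecisionProcess could look like this, where gamma is a discount factor and values maps states to floats:

    def value_iteration_sweep(mdp, values, gamma = 0.9):
        """ Compute one synchronous value-iteration sweep and return the new value table. """

        new_values = {}
        for state in mdp.get_states():
            if (mdp.is_terminal_state(state)):
                # By convention, terminal states have zero future rewards.
                new_values[state] = 0.0
                continue

            # V(s) = max_a sum_{s'} P(s' | s, a) * (R(s, a, s') + gamma * V(s')).
            q_values = []
            for action in mdp.get_possible_actions(state):
                q_value = 0.0
                for transition in mdp.get_transitions(state, action):
                    q_value += transition.probability * (transition.reward + gamma * values.get(transition.state, 0.0))

                q_values.append(q_value)

            new_values[state] = max(q_values) if (q_values) else 0.0

        return new_values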

def game_start(self, initial_game_state: pacai.core.gamestate.GameState) -> None:

Inform the MDP about the game's start. This is the MDP's first chance to see the game/board and initialize the appropriate data.

@abc.abstractmethod
def get_starting_state(self) -> ~StateType:

Return the starting state of this MDP.

@abc.abstractmethod
def get_states(self) -> list[~StateType]:

Return a list of all states in this MDP.

@abc.abstractmethod
def is_terminal_state(self, state: ~StateType) -> bool:

Returns true if the given state is a terminal state. By convention, a terminal state has zero future rewards. Sometimes the terminal state(s) may have no possible actions. It is also common to think of the terminal state as having a self-loop action 'pass' with zero reward; the formulations are equivalent.
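
Concretely, the two formulations mentioned above look like this inside a get_transitions() implementation (a sketch; the 'pass' action name is the conventional one described in the docstring):

    # Formulation 1: a terminal state simply has no outgoing transitions.
    if (self.is_terminal_state(state)):
        return []

    # Formulation 2 (equivalent): a zero-reward self-loop taken with probability 1.0.
    if (self.is_terminal_state(state)):
        return [Transition(state, pacai.core.action.Action('pass'), 1.0, 0.0)]

Either way, no reward can accumulate after the terminal state is reached.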

@abc.abstractmethod
def get_possible_actions(self, state: ~StateType) -> list[pacai.core.action.Action]:

Return the possible actions from the given MDP state.

@abc.abstractmethod
def get_transitions(self, state: ~StateType, action: pacai.core.action.Action) -> list[Transition[~StateType]]:

Get a list of the possible transitions from the given state with the given action. A transition consists of the target state, the probability that the transition is taken, and the reward for taking that transition.

All transition probabilities should add up to 1.0.

Note that in model-free reinforcement-learning methods, such as Q-learning, we do not know these probabilities, nor do we model them directly.
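
As a sketch of how an implementation might honor the sum-to-1.0 contract, consider gridworld-style noisy movement where the intended direction succeeds 80% of the time and each perpendicular slip happens 10% of the time. The helpers _move(), _left_of(), and _right_of() and the reward attributes are hypothetical; Transition.update() handles slips that resolve to the same resulting state:

    def get_transitions(self, state, action):
        if (self.is_terminal_state(state)):
            return []

        if (action == ACTION_EXIT):
            # Exiting always succeeds and leads into the unique terminal state.
            terminal = MDPStatePosition(position = TERMINAL_POSITION)
            return [Transition(terminal, action, 1.0, self._exit_reward)]

        candidates = [
            Transition(self._move(state, action), action, 0.8, self._living_reward),
            Transition(self._move(state, self._left_of(action)), action, 0.1, self._living_reward),
            Transition(self._move(state, self._right_of(action)), action, 0.1, self._living_reward),
        ]

        # Merge outcomes that land in the same state so each state appears once;
        # the merged probabilities still sum to 1.0.
        merged = {}
        for transition in candidates:
            if (transition.state in merged):
                merged[transition.state].update(transition)
            else:
                merged[transition.state] = transition

        return list(merged.values())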