Module pacai.agents.learning.reinforcement
Expand source code
import abc
import logging
import time
from pacai.agents.learning.value import ValueEstimationAgent
class ReinforcementAgent(ValueEstimationAgent):
    """
    An abstract value estimation agent that learns by estimating Q-values from experience.

    You should know the following:
    The environment will call `ReinforcementAgent.observeTransition`,
    which will then call `ReinforcementAgent.update` (which you should override).
    Use `ReinforcementAgent.getLegalActions` to know which actions are available in a state.
    """

    def __init__(self, index, actionFn = None, numTraining = 100, epsilon = 0.5,
            alpha = 0.5, gamma = 1, **kwargs):
        """
        Args:
            actionFn: A function which takes a state and returns the list of legal actions.
                Defaults to calling `getLegalActions()` on the state.
            alpha: The learning rate.
            epsilon: The exploration rate.
            gamma: The discount factor.
            numTraining: The number of training episodes.
        """

        super().__init__(index, **kwargs)

        if (actionFn is None):
            actionFn = lambda state: state.getLegalActions()

        self.actionFn = actionFn
        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0
        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discountRate = float(gamma)

    @abc.abstractmethod
    def update(self, state, action, nextState, reward):
        """
        This class will call this function after observing a transition and reward.
        """

        pass

    def getAlpha(self):
        """
        Get the current learning rate.
        """

        return self.alpha

    def getDiscountRate(self):
        """
        Get the discount factor.
        """

        return self.discountRate

    def getEpsilon(self):
        """
        Get the current exploration rate.
        """

        return self.epsilon

    def getGamma(self):
        """
        Get the discount factor (same value as `getDiscountRate`).
        """

        return self.discountRate

    def getLegalActions(self, state):
        """
        Get the actions available for a given state.
        This is what you should use to obtain legal actions for a state.
        """

        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
        Called by environment to inform agent that a transition has been observed.
        This will result in a call to `ReinforcementAgent.update` on the same arguments.
        You should not directly call this function (the environment will).
        """

        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
        Called by environment when a new episode is starting.
        """

        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
        Called by environment when an episode is done.
        """

        # The episode's rewards go to exactly one accumulator, chosen by
        # whether the episode that just finished was a training episode.
        if (self.episodesSoFar < self.numTraining):
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards

        self.episodesSoFar += 1

        if (self.episodesSoFar >= self.numTraining):
            # Take off the training wheels.
            self.epsilon = 0.0  # No exploration.
            self.alpha = 0.0  # No learning.

    def isInTraining(self):
        """
        Check if fewer than `numTraining` episodes have finished.
        """

        return (self.episodesSoFar < self.numTraining)

    def isInTesting(self):
        """
        Check if training is over (the negation of `isInTraining`).
        """

        return not self.isInTraining()

    def setEpsilon(self, epsilon):
        """
        Set the exploration rate.
        """

        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        """
        Set the learning rate.
        """

        self.alpha = alpha

    def setDiscount(self, discount):
        """
        Set the discount factor.
        """

        self.discountRate = discount

    def doAction(self, state, action):
        """
        Called by inherited class when an action is taken in a state.
        """

        self.lastState = state
        self.lastAction = action

    def observationFunction(self, state):
        """
        This is where we ended up after our last action.
        """

        # Nothing to report until an action has been recorded for this episode.
        if self.lastState is not None:
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)

    def registerInitialState(self, state):
        """
        Start a new episode and, before the very first one, log that training begins.
        """

        # Resets lastState/lastAction/episodeRewards, which observeTransition relies on.
        self.startEpisode()

        if self.episodesSoFar == 0:
            logging.debug('Beginning %d episodes of Training', self.numTraining)

    def final(self, state):
        """
        Called by Pacman game at the terminal state.

        Reports the last transition, closes the episode, and logs a status
        summary every `NUM_EPS_UPDATE` finished episodes.
        """

        deltaReward = state.getScore() - self.lastState.getScore()
        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        # These two attributes are created lazily on the first call rather than in __init__.
        if ('episodeStartTime' not in self.__dict__):
            self.episodeStartTime = time.time()

        if ('lastWindowAccumRewards' not in self.__dict__):
            self.lastWindowAccumRewards = 0.0

        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if (self.episodesSoFar % NUM_EPS_UPDATE == 0):
            logging.debug('Reinforcement Learning Status:')
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)

            if (self.episodesSoFar <= self.numTraining):
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                logging.debug('\tCompleted %d out of %d training episodes',
                        self.episodesSoFar, self.numTraining)
                logging.debug('\tAverage Rewards over all training: %.2f', trainAvg)
            else:
                # Only reached when episodesSoFar > numTraining, so the divisor is positive.
                testEpisodes = self.episodesSoFar - self.numTraining
                testAvg = float(self.accumTestRewards) / testEpisodes
                logging.debug('\tCompleted %d test episodes', testEpisodes)
                logging.debug('\tAverage Rewards over testing: %.2f', testAvg)

            logging.debug('\tAverage Rewards for last %d episodes: %.2f',
                    NUM_EPS_UPDATE, windowAvg)
            # The timer is only reset below, so this spans the whole reporting window.
            logging.debug('\tEpisode took %.2f seconds', time.time() - self.episodeStartTime)

            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if (self.episodesSoFar == self.numTraining):
            msg = 'Training Done (turning off epsilon and alpha)'
            logging.debug('%s\n%s', msg, '-' * len(msg))
class ReinforcementAgent (index, actionFn=None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1, **kwargs)
An abstract value estimation agent that learns by estimating Q-values from experience.
You should know the following: The environment will call
`ReinforcementAgent.observeTransition()`, which will then call `ReinforcementAgent.update()`
(which you should override). Use `ReinforcementAgent.getLegalActions()`
to know which actions are available in a state.
Args
- actionFn: A function which takes a state and returns the list of legal actions.
- alpha: The learning rate.
- epsilon: The exploration rate.
- gamma: The discount factor.
- numTraining: The number of training episodes.
Expand source code
class ReinforcementAgent(ValueEstimationAgent):
    """
    An abstract value estimation agent that learns by estimating Q-values from experience.

    You should know the following:
    The environment will call `ReinforcementAgent.observeTransition`,
    which will then call `ReinforcementAgent.update` (which you should override).
    Use `ReinforcementAgent.getLegalActions` to know which actions are available in a state.
    """

    def __init__(self, index, actionFn = None, numTraining = 100, epsilon = 0.5,
            alpha = 0.5, gamma = 1, **kwargs):
        """
        Args:
            actionFn: A function which takes a state and returns the list of legal actions.
                Defaults to calling `getLegalActions()` on the state.
            alpha: The learning rate.
            epsilon: The exploration rate.
            gamma: The discount factor.
            numTraining: The number of training episodes.
        """

        super().__init__(index, **kwargs)

        if (actionFn is None):
            actionFn = lambda state: state.getLegalActions()

        self.actionFn = actionFn
        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0
        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discountRate = float(gamma)

    @abc.abstractmethod
    def update(self, state, action, nextState, reward):
        """
        This class will call this function after observing a transition and reward.
        """

        pass

    def getAlpha(self):
        """
        Get the current learning rate.
        """

        return self.alpha

    def getDiscountRate(self):
        """
        Get the discount factor.
        """

        return self.discountRate

    def getEpsilon(self):
        """
        Get the current exploration rate.
        """

        return self.epsilon

    def getGamma(self):
        """
        Get the discount factor (same value as `getDiscountRate`).
        """

        return self.discountRate

    def getLegalActions(self, state):
        """
        Get the actions available for a given state.
        This is what you should use to obtain legal actions for a state.
        """

        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
        Called by environment to inform agent that a transition has been observed.
        This will result in a call to `ReinforcementAgent.update` on the same arguments.
        You should not directly call this function (the environment will).
        """

        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
        Called by environment when a new episode is starting.
        """

        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
        Called by environment when an episode is done.
        """

        # The episode's rewards go to exactly one accumulator, chosen by
        # whether the episode that just finished was a training episode.
        if (self.episodesSoFar < self.numTraining):
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards

        self.episodesSoFar += 1

        if (self.episodesSoFar >= self.numTraining):
            # Take off the training wheels.
            self.epsilon = 0.0  # No exploration.
            self.alpha = 0.0  # No learning.

    def isInTraining(self):
        """
        Check if fewer than `numTraining` episodes have finished.
        """

        return (self.episodesSoFar < self.numTraining)

    def isInTesting(self):
        """
        Check if training is over (the negation of `isInTraining`).
        """

        return not self.isInTraining()

    def setEpsilon(self, epsilon):
        """
        Set the exploration rate.
        """

        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        """
        Set the learning rate.
        """

        self.alpha = alpha

    def setDiscount(self, discount):
        """
        Set the discount factor.
        """

        self.discountRate = discount

    def doAction(self, state, action):
        """
        Called by inherited class when an action is taken in a state.
        """

        self.lastState = state
        self.lastAction = action

    def observationFunction(self, state):
        """
        This is where we ended up after our last action.
        """

        # Nothing to report until an action has been recorded for this episode.
        if self.lastState is not None:
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)

    def registerInitialState(self, state):
        """
        Start a new episode and, before the very first one, log that training begins.
        """

        # Resets lastState/lastAction/episodeRewards, which observeTransition relies on.
        self.startEpisode()

        if self.episodesSoFar == 0:
            logging.debug('Beginning %d episodes of Training', self.numTraining)

    def final(self, state):
        """
        Called by Pacman game at the terminal state.

        Reports the last transition, closes the episode, and logs a status
        summary every `NUM_EPS_UPDATE` finished episodes.
        """

        deltaReward = state.getScore() - self.lastState.getScore()
        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        # These two attributes are created lazily on the first call rather than in __init__.
        if ('episodeStartTime' not in self.__dict__):
            self.episodeStartTime = time.time()

        if ('lastWindowAccumRewards' not in self.__dict__):
            self.lastWindowAccumRewards = 0.0

        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if (self.episodesSoFar % NUM_EPS_UPDATE == 0):
            logging.debug('Reinforcement Learning Status:')
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)

            if (self.episodesSoFar <= self.numTraining):
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                logging.debug('\tCompleted %d out of %d training episodes',
                        self.episodesSoFar, self.numTraining)
                logging.debug('\tAverage Rewards over all training: %.2f', trainAvg)
            else:
                # Only reached when episodesSoFar > numTraining, so the divisor is positive.
                testEpisodes = self.episodesSoFar - self.numTraining
                testAvg = float(self.accumTestRewards) / testEpisodes
                logging.debug('\tCompleted %d test episodes', testEpisodes)
                logging.debug('\tAverage Rewards over testing: %.2f', testAvg)

            logging.debug('\tAverage Rewards for last %d episodes: %.2f',
                    NUM_EPS_UPDATE, windowAvg)
            # The timer is only reset below, so this spans the whole reporting window.
            logging.debug('\tEpisode took %.2f seconds', time.time() - self.episodeStartTime)

            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if (self.episodesSoFar == self.numTraining):
            msg = 'Training Done (turning off epsilon and alpha)'
            logging.debug('%s\n%s', msg, '-' * len(msg))
- ValueEstimationAgent
- BaseAgent
- abc.ABC
Static methods
def loadAgent(name, index, args={})
Inherited from:
Load an agent with the given class name. The name can be fully qualified or just the bare class name. If the bare name is given, the class should …
def doAction(self, state, action)
Called by inherited class when an action is taken in a state.
Expand source code
def doAction(self, state, action):
    """
    Remember the state the agent acted in and the action it chose.

    Called by inherited class when an action is taken in a state.
    """

    self.lastState, self.lastAction = state, action
def final(self, state)
Called by Pacman game at the terminal state.
Expand source code
def final(self, state):
    """
    Called by Pacman game at the terminal state.

    Reports the last transition, closes the episode, and logs a status
    summary every `NUM_EPS_UPDATE` finished episodes.
    """

    deltaReward = state.getScore() - self.lastState.getScore()
    self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
    self.stopEpisode()

    # These two attributes are created lazily on the first call.
    if ('episodeStartTime' not in self.__dict__):
        self.episodeStartTime = time.time()

    if ('lastWindowAccumRewards' not in self.__dict__):
        self.lastWindowAccumRewards = 0.0

    self.lastWindowAccumRewards += state.getScore()

    NUM_EPS_UPDATE = 100
    if (self.episodesSoFar % NUM_EPS_UPDATE == 0):
        logging.debug('Reinforcement Learning Status:')
        windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)

        if (self.episodesSoFar <= self.numTraining):
            trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
            logging.debug('\tCompleted %d out of %d training episodes',
                    self.episodesSoFar, self.numTraining)
            logging.debug('\tAverage Rewards over all training: %.2f', trainAvg)
        else:
            # Only reached when episodesSoFar > numTraining, so the divisor is positive.
            testEpisodes = self.episodesSoFar - self.numTraining
            testAvg = float(self.accumTestRewards) / testEpisodes
            logging.debug('\tCompleted %d test episodes', testEpisodes)
            logging.debug('\tAverage Rewards over testing: %.2f', testAvg)

        logging.debug('\tAverage Rewards for last %d episodes: %.2f',
                NUM_EPS_UPDATE, windowAvg)
        # The timer is only reset below, so this spans the whole reporting window.
        logging.debug('\tEpisode took %.2f seconds', time.time() - self.episodeStartTime)

        self.lastWindowAccumRewards = 0.0
        self.episodeStartTime = time.time()

    if (self.episodesSoFar == self.numTraining):
        msg = 'Training Done (turning off epsilon and alpha)'
        logging.debug('%s\n%s', msg, '-' * len(msg))
def getAction(self, state)
Inherited from:
The BaseAgent will receive a state, and must return an action from Directions.
def getAlpha(self)
Expand source code
def getAlpha(self):
    """
    Get the current learning rate.
    """

    learningRate = self.alpha
    return learningRate
def getDiscountRate(self)
Expand source code
def getDiscountRate(self):
    """
    Get the discount factor.
    """

    discount = self.discountRate
    return discount
def getEpsilon(self)
Expand source code
def getEpsilon(self):
    """
    Get the current exploration rate.
    """

    explorationRate = self.epsilon
    return explorationRate
def getGamma(self)
Expand source code
def getGamma(self):
    """
    Get the discount factor; reads the same attribute as `getDiscountRate`.
    """

    gamma = self.discountRate
    return gamma
def getLegalActions(self, state)
Get the actions available for a given state. This is what you should use to obtain legal actions for a state.
Expand source code
def getLegalActions(self, state):
    """
    Get the actions available for a given state.

    This is what you should use to obtain legal actions for a state;
    the lookup is delegated to the agent's `actionFn`.
    """

    lookup = self.actionFn
    return lookup(state)
def getPolicy(self, state)
Inherited from:
What is the best action to take in the state? Note that because we might want to explore, this might not coincide with …
def getQValue(self, state, action)
Inherited from:
Should return Q(state,action).
def getValue(self, state)
Inherited from:
What is the value of this state under the best action? Concretely, this is given by:
V(state) = max_{action in actions} Q(state ,action)
def isInTesting(self)
Expand source code
def isInTesting(self):
    """
    Check if the agent is past training (the negation of `isInTraining`).
    """

    stillTraining = self.isInTraining()
    return not stillTraining
def isInTraining(self)
Expand source code
def isInTraining(self):
    """
    Check if fewer than `numTraining` episodes have finished.
    """

    return self.numTraining > self.episodesSoFar
def observationFunction(self, state)
This is where we ended up after our last action.
Expand source code
def observationFunction(self, state):
    """
    This is where we ended up after our last action.

    If an earlier state was recorded, the score difference between it and
    `state` is reported as the reward through `observeTransition`.
    """

    previous = self.lastState
    if previous is None:
        # No action has been recorded yet, so there is no transition to report.
        return

    scoreChange = state.getScore() - previous.getScore()
    self.observeTransition(previous, self.lastAction, state, scoreChange)
def observeTransition(self, state, action, nextState, deltaReward)
Called by environment to inform agent that a transition has been observed. This will result in a call to `ReinforcementAgent.update` on the same arguments. You should not directly call this function (the environment will).
Expand source code
def observeTransition(self, state, action, nextState, deltaReward):
    """
    Called by environment to inform agent that a transition has been observed.

    The reward is added to the running total for the current episode, and then
    `ReinforcementAgent.update` is called on the same arguments.
    You should not directly call this function (the environment will).
    """

    self.episodeRewards += deltaReward

    transition = (state, action, nextState, deltaReward)
    self.update(*transition)
def registerInitialState(self, state)
Inherited from:
Inspect the starting state.
Expand source code
def registerInitialState(self, state):
    """
    Start a new episode and, before the very first one, log that training begins.
    """

    self.startEpisode()

    if self.episodesSoFar != 0:
        return

    logging.debug('Beginning %d episodes of Training' % (self.numTraining))
def setDiscount(self, discount)
Expand source code
def setDiscount(self, discount):
    """
    Set the discount factor; the value is stored as given.
    """

    setattr(self, 'discountRate', discount)
def setEpsilon(self, epsilon)
Expand source code
def setEpsilon(self, epsilon):
    """
    Set the exploration rate; the value is stored as given.
    """

    setattr(self, 'epsilon', epsilon)
def setLearningRate(self, alpha)
Expand source code
def setLearningRate(self, alpha):
    """
    Set the learning rate; the value is stored as given.
    """

    setattr(self, 'alpha', alpha)
def startEpisode(self)
Called by environment when a new episode is starting.
Expand source code
def startEpisode(self):
    """
    Called by environment when a new episode is starting.

    Clears the remembered state/action pair and zeroes the episode's reward total.
    """

    self.lastState = self.lastAction = None
    self.episodeRewards = 0.0
def stopEpisode(self)
Called by environment when an episode is done.
Expand source code
def stopEpisode(self):
    """
    Called by environment when an episode is done.

    Adds the episode's rewards to the training or testing total, counts the
    episode, and zeroes epsilon and alpha once `numTraining` episodes are done.
    """

    wasTraining = self.episodesSoFar < self.numTraining
    if wasTraining:
        self.accumTrainRewards += self.episodeRewards
    else:
        self.accumTestRewards += self.episodeRewards

    self.episodesSoFar += 1

    if self.episodesSoFar < self.numTraining:
        return

    # Take off the training wheels.
    self.epsilon = 0.0  # No exploration.
    self.alpha = 0.0  # No learning.
def update(self, state, action, nextState, reward)
This class will call this function after observing a transition and reward.
Expand source code
@abc.abstractmethod
def update(self, state, action, nextState, reward):
    """
    Hook for learning from one observed transition and its reward.

    This class will call this function after observing a transition and reward.
    The base version does nothing.
    """

    return None