Module pacai.agents.learning.reinforcement
Source code:
import abc
import logging
import time

from pacai.agents.learning.value import ValueEstimationAgent

class ReinforcementAgent(ValueEstimationAgent):
    """
    An abstract value estimation agent that learns by estimating Q-values from experience.

    You should know the following:
    The environment will call `ReinforcementAgent.observeTransition`,
    which will then call `ReinforcementAgent.update` (which you should override).
    Use `ReinforcementAgent.getLegalActions` to know which actions are available in a state.
    """

    def __init__(self, index, actionFn = None, numTraining = 100, epsilon = 0.5,
            alpha = 0.5, gamma = 1, **kwargs):
        """
        Args:
            actionFn: A function which takes a state and returns the list of legal actions.
            alpha: The learning rate.
            epsilon: The exploration rate.
            gamma: The discount factor.
            numTraining: The number of training episodes.
        """

        super().__init__(index, **kwargs)

        if (actionFn is None):
            actionFn = lambda state: state.getLegalActions()
        self.actionFn = actionFn

        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0

        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discountRate = float(gamma)

    @abc.abstractmethod
    def update(self, state, action, nextState, reward):
        """
        This class will call this function after observing a transition and reward.
        """

        pass

    def getAlpha(self):
        return self.alpha

    def getDiscountRate(self):
        return self.discountRate

    def getEpsilon(self):
        return self.epsilon

    def getGamma(self):
        return self.discountRate

    def getLegalActions(self, state):
        """
        Get the actions available for a given state.
        This is what you should use to obtain legal actions for a state.
        """

        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
        Called by environment to inform agent that a transition has been observed.
        This will result in a call to `ReinforcementAgent.update` on the same arguments.
        You should not directly call this function (the environment will).
        """

        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
        Called by environment when a new episode is starting.
        """

        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
        Called by environment when an episode is done.
        """

        if (self.episodesSoFar < self.numTraining):
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards

        self.episodesSoFar += 1
        if (self.episodesSoFar >= self.numTraining):
            # Take off the training wheels.
            self.epsilon = 0.0  # No exploration.
            self.alpha = 0.0  # No learning.

    def isInTraining(self):
        return (self.episodesSoFar < self.numTraining)

    def isInTesting(self):
        return not self.isInTraining()

    def setEpsilon(self, epsilon):
        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        self.alpha = alpha

    def setDiscount(self, discount):
        self.discountRate = discount

    def doAction(self, state, action):
        """
        Called by inherited class when an action is taken in a state.
        """

        self.lastState = state
        self.lastAction = action

    def observationFunction(self, state):
        """
        This is where we ended up after our last action.
        """

        if self.lastState is not None:
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)

    def registerInitialState(self, state):
        self.startEpisode()

        if self.episodesSoFar == 0:
            logging.debug('Beginning %d episodes of Training' % (self.numTraining))

    def final(self, state):
        """
        Called by Pacman game at the terminal state.
        """

        deltaReward = state.getScore() - self.lastState.getScore()
        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        if ('episodeStartTime' not in self.__dict__):
            self.episodeStartTime = time.time()

        if ('lastWindowAccumRewards' not in self.__dict__):
            self.lastWindowAccumRewards = 0.0
        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if (self.episodesSoFar % NUM_EPS_UPDATE == 0):
            logging.debug('Reinforcement Learning Status:')
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)

            if (self.episodesSoFar <= self.numTraining):
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                logging.debug('\tCompleted %d out of %d training episodes' %
                        (self.episodesSoFar, self.numTraining))
                logging.debug('\tAverage Rewards over all training: %.2f' % (trainAvg))
            else:
                testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining)
                logging.debug('\tCompleted %d test episodes' %
                        (self.episodesSoFar - self.numTraining))
                logging.debug('\tAverage Rewards over testing: %.2f' % (testAvg))

            logging.info('\tAverage Rewards for last %d episodes: %.2f' %
                    (NUM_EPS_UPDATE, windowAvg))
            logging.info('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))

            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if (self.episodesSoFar == self.numTraining):
            msg = 'Training Done (turning off epsilon and alpha)'
            logging.debug('%s\n%s' % (msg, '-' * len(msg)))
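For orientation, one episode as driven by the game follows roughly the sequence sketched below. This is a minimal sketch, not the actual harness: runEpisode, env, and env's methods are hypothetical names used only for illustration.

def runEpisode(agent, env):
    state = env.getStartState()
    agent.registerInitialState(state)  # Calls startEpisode() to reset per-episode bookkeeping.

    while (not env.isTerminal(state)):
        action = agent.getAction(state)  # Subclasses record the pair via doAction(state, action).
        state = env.executeAction(action)

        if (not env.isTerminal(state)):
            # Delivers the score delta to observeTransition(), which calls update().
            agent.observationFunction(state)

    agent.final(state)  # Reports the final reward, then calls stopEpisode().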
Classes
class ReinforcementAgent (index, actionFn=None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1, **kwargs)
-
An abstract value estimation agent that learns by estimating Q-values from experience.
You should know the following: The environment will call
ReinforcementAgent.observeTransition(), which will then call
ReinforcementAgent.update() (which you should override). Use
ReinforcementAgent.getLegalActions() to know which actions are available in a state.
Args
actionFn
- A function which takes a state and returns the list of legal actions.
alpha
- The learning rate.
epsilon
- The exploration rate.
gamma
- The discount factor.
numTraining
- The number of training episodes.
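As a rough usage sketch of the constructor parameters (MyQAgent is a hypothetical subclass; in practice the Pacman launcher constructs agents from command-line options):

# Train for 50 episodes with moderate exploration, then evaluate.
agent = MyQAgent(0, numTraining = 50, epsilon = 0.3, alpha = 0.2, gamma = 0.9)
# After the 50th call to stopEpisode(), epsilon and alpha are forced to 0.0,
# so the agent stops exploring and stops learning (see stopEpisode()).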
Ancestors
- ValueEstimationAgent
- BaseAgent
- abc.ABC
Static methods
def loadAgent(name, index, args={})
-
Inherited from: ValueEstimationAgent.loadAgent
Load an agent with the given class name. The name can be fully qualified or just the bare class name. If the bare name is given, the class should …
Methods
def doAction(self, state, action)
-
Called by inherited class when an action is taken in a state.
def final(self, state)
-
Called by Pacman game at the terminal state.
def getAction(self, state)
-
Inherited from: ValueEstimationAgent.getAction
The BaseAgent will receive an AbstractGameState, and must return an action from Directions.
def getAlpha(self)
-
Return the current learning rate (alpha).
def getDiscountRate(self)
-
Return the discount rate (gamma).
def getEpsilon(self)
-
Return the current exploration rate (epsilon).
def getGamma(self)
-
Return the discount rate (gamma); an alias for getDiscountRate().
def getLegalActions(self, state)
-
Get the actions available for a given state. This is what you should use to obtain legal actions for a state.
def getPolicy(self, state)
-
Inherited from: ValueEstimationAgent.getPolicy
What is the best action to take in the state? Note that because we might want to explore, this might not coincide with …
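The exploration mentioned above is commonly realized with epsilon-greedy action selection. A minimal sketch of such a getAction override, assuming the subclass provides a getQValue implementation (none of this is part of this module):

import random

def getAction(self, state):
    # Explore with probability epsilon, otherwise act greedily on Q-values.
    actions = self.getLegalActions(state)
    if (len(actions) == 0):
        return None

    if (random.random() < self.getEpsilon()):
        action = random.choice(actions)
    else:
        action = max(actions, key = lambda a: self.getQValue(state, a))

    self.doAction(state, action)  # Record the pair for observationFunction().
    return action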
def getQValue(self, state, action)
-
Inherited from: ValueEstimationAgent.getQValue
Should return Q(state, action).
def getValue(self, state)
-
Inherited from: ValueEstimationAgent.getValue
What is the value of this state under the best action? Concretely, this is given by:
V(state) = max_{action in actions} Q(state, action)
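A minimal sketch of that maximization, again assuming the subclass provides getQValue (by convention, terminal states with no legal actions get a value of 0.0):

def getValue(self, state):
    # V(state) = max over legal actions of Q(state, action).
    actions = self.getLegalActions(state)
    return max((self.getQValue(state, a) for a in actions), default = 0.0)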
def isInTesting(self)
-
Return True if the agent has finished training (the logical inverse of isInTraining).
def isInTraining(self)
-
Return True if fewer than numTraining episodes have completed.
def observationFunction(self, state)
-
This is where we ended up after our last action.
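The reward it reports is the change in game score: for example, if the score moves from 10 to 8 after a move, update() receives a reward of -2.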
def observeTransition(self, state, action, nextState, deltaReward)
-
Called by environment to inform agent that a transition has been observed. This will result in a call to ReinforcementAgent.update() on the same arguments. You should not directly call this function (the environment will).
def registerInitialState(self, state)
-
Inherited from: ValueEstimationAgent.registerInitialState
Inspect the starting state.
def setDiscount(self, discount)
-
Set the discount rate (gamma).
def setEpsilon(self, epsilon)
-
Set the exploration rate (epsilon).
def setLearningRate(self, alpha)
-
Set the learning rate (alpha).
def startEpisode(self)
-
Called by environment when a new episode is starting.
def stopEpisode(self)
-
Called by environment when an episode is done.
def update(self, state, action, nextState, reward)
-
This class will call this function after observing a transition and reward.
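For reference, a minimal sketch of a tabular Q-learning override, assuming the subclass stores Q-values in a collections.defaultdict(float) named qValues keyed by (state, action) pairs (a hypothetical attribute, not part of this class):

def update(self, state, action, nextState, reward):
    # Q-learning backup:
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a')).
    nextActions = self.getLegalActions(nextState)
    nextValue = max((self.qValues[(nextState, a)] for a in nextActions), default = 0.0)
    sample = reward + self.getDiscountRate() * nextValue

    oldValue = self.qValues[(state, action)]
    self.qValues[(state, action)] = (1.0 - self.getAlpha()) * oldValue + self.getAlpha() * sample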