# Reinforcement Learning with Monte Carlo and Tabulation Methods

I was reading another blog post about reinforcement learning using Monte Carlo and tabulation methods that provided an example of the technique using Blackjack. I decided to implement my own method using Tic-Tac-Toe as an example. The basic idea is that you generate a random state of a game, and you generate a random action based on that state. Then you play the rest of the game through until the end and record the outcome. Then you should be able to store the state, action, and outcome as a key in a dictionary that refers to a count. Each time that state-action-outcome occurs again, you update the count by one. Over time, your dictionary will encode information about the relative strengths of different actions for a given state.

In my implementation, I was thinking about Swift and how I would define a standard protocol for an object that could be used to produce a dictionary of state-action-outcome counts. So far, I think that you’d want methods to return the current state, the legal actions, the outcome (win, lose, draw, incomplete), and methods to make a move and reset the game.

Here’s my Python code so far:

```python
import random

class TicTacToe:
    """A 3x3 Tic-Tac-Toe board with helpers for random self-play.

    Cells hold ``None`` (empty) or a player token (0 or 1). Locations are
    ``[row, col]`` pairs with 0-based indices.
    """

    def __init__(self, boxes=None):
        """Create a board.

        boxes: optional 3x3 list-of-lists starting position; defaults to
        an empty board. (Previously this parameter was accepted but
        silently ignored.)
        """
        if boxes is None:
            boxes = [[None] * 3 for _ in range(3)]
        self.boxes = boxes

    def __repr__(self):
        """Produce a pretty-printed version of the board; empty cells as '.'."""
        s = ""
        for row in self.boxes:
            for cell in row:
                s += ". " if cell is None else str(cell) + " "
            s += "\n"
        return s

    def reset(self):
        """Clear the board back to all-empty."""
        self.boxes = [[None] * 3 for _ in range(3)]

    def getRow(self, i):
        """Return one of the rows as a list of 3 cells."""
        return list(self.boxes[i])

    def getColumn(self, i):
        """Return one of the columns as a list of 3 cells."""
        return [self.boxes[j][i] for j in range(3)]

    def getDiagonal(self, direction):
        """Return one of the diagonals.

        direction: 1 for the anti-diagonal (top-right to bottom-left),
        -1 for the main diagonal. Any other value returns None.
        """
        if direction == 1:
            return [self.boxes[i][2 - i] for i in range(3)]
        elif direction == -1:
            return [self.boxes[i][i] for i in range(3)]

    def _win(self, line):
        """Return the winning token if all three cells of `line` hold the
        same non-None token, else None.

        Bug fix: the old version returned the reduced *list* — so an
        all-empty line yielded the truthy ``[None]``, making winner()
        report a win on any board with a blank line, and a real win
        yielded ``[0]``/``[1]`` instead of the int code.
        """
        unique = set(line)
        if len(unique) == 1:
            token = unique.pop()
            # `is not None` (not truthiness): token 0 is a valid winner.
            if token is not None:
                return token
        return None

    def winner(self):
        """Return an integer code for the following cases
        0: 0 wins
        1: 1 wins
        2: draw
        3: in progress
        """
        lines = [self.getRow(i) for i in range(3)]
        lines += [self.getColumn(i) for i in range(3)]
        lines += [self.getDiagonal(1), self.getDiagonal(-1)]
        for line in lines:
            token = self._win(line)
            if token is not None:
                return token
        return 3 if None in self.state() else 2

    def action(self):
        """Return a random legal move as [row, col], or None if the board
        is full."""
        moves = [[i, j]
                 for i in range(3)
                 for j in range(3)
                 if self.boxes[i][j] is None]
        if not moves:
            return None
        return random.choice(moves)

    def state(self):
        """Return a flattened (9-element) list view of the board."""
        return [cell for row in self.boxes for cell in row]

    def play(self, token, location):
        """Place `token` at `location` ([row, col]); no-op if location is None."""
        if location is None:
            return
        i, j = location
        self.boxes[i][j] = token

    def generateRandomState(self, depth):
        """Play up to `depth` random alternating moves (player 0 first)
        and return the resulting flattened state. Stops early if the
        board fills up."""
        for move in range(depth):
            self.play(move % 2, self.action())
            if None not in self.state():
                break
        return self.state()

    def playRandomGame(self, player):
        """Play random alternating moves, `player` moving first, until the
        game is no longer in progress."""
        while True:
            self.play(player % 2, self.action())
            if self.winner() != 3:
                break
            player += 1

def train(game, depth=0, n=1000):
    """Build a Monte Carlo count table from random self-play.

    For each pre-play depth d in range(depth), run n episodes: generate a
    random position of d moves, pick one random action, finish the game
    with random play, and count the (state, action, outcome) triple.

    game: an object exposing generateRandomState, winner, action, play,
          playRandomGame, reset, and state (e.g. TicTacToe).
    Returns a dict mapping (tuple(state), tuple(action), winner) -> count.
    """
    q = {}
    for d in range(depth):
        for _ in range(n):
            state = game.generateRandomState(d)
            # Skip positions that are already decided (or missing).
            # Bug fix: the old code `continue`d without resetting, so a
            # finished board leaked into the next episode's state.
            if state is None or game.winner() < 3:
                game.reset()
                continue
            action = game.action()
            game.play(d % 2, action)
            # Sides alternate, so the opponent starts the random rollout.
            game.playRandomGame((d + 1) % 2)
            outcome = game.winner()
            key = (tuple(state), tuple(action), outcome)
            # Single-lookup increment instead of `if key in q.keys()`.
            q[key] = q.get(key, 0) + 1
            game.reset()
    return q

def qactions(q, state, winner):
    """Return {action: count} for every entry of `q` whose state matches
    `state` and whose recorded outcome equals `winner`.

    q: dict keyed by (state_tuple, action_tuple, winner) with int counts.
    Note: uses .items() — the original .iteritems() is Python-2-only.
    """
    target = tuple(state)
    return {action: count
            for (s, action, w), count in q.items()
            if s == target and w == winner}

def qaction(q, state, winner):
    """Return the most frequently counted action (as [row, col]) for
    `state` with target outcome `winner`, or None if no entry matches.

    Bug fix: when nothing matched, the old code fell through to
    ``list(None)`` and raised TypeError. Returning None is safe here
    because TicTacToe.play treats a None location as a no-op.
    """
    actions = qactions(q, state, winner)
    if not actions:
        return None
    best = max(actions, key=actions.get)
    return list(best)

# create a dictionary, q, by Monte Carlo self-play
# (depth=10, n=50000 takes a while; lower n for a quick smoke test)

tictactoe = TicTacToe()
q = train(tictactoe, depth=10, n=50000)

# then, interactively, I take a turn as player 0
# (print(t) is valid in both Python 2 and 3; `print t` is 2-only)

t = TicTacToe()
t.play(0, [1, 1])
print(t)

# and then ask the computer to make the move most often counted
# in positions that player 1 went on to win

t.play(1, qaction(q, t.state(), 1))
print(t)

# et cetera

```

This site uses Akismet to reduce spam. Learn how your comment data is processed.