Reinforcement Learning with Monte Carlo and Tabulation Methods

I was reading another blog post about reinforcement learning using Monte Carlo and tabulation methods that worked through an example with Blackjack, and I decided to implement my own version using Tic-Tac-Toe. The basic idea is that you generate a random state of a game and a random action from that state, then play the rest of the game out to the end and record the outcome. The state, action, and outcome together form a key in a dictionary that maps to a count, and each time that state-action-outcome combination occurs again, you increment the count by one. Over time, the dictionary encodes information about the relative strengths of different actions for a given state.
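
In code, that bookkeeping is just a dictionary keyed on (state, action, outcome) tuples. Here is a minimal sketch of the update step (the names are placeholders, not the implementation further down):

q = {}

def record( q, state, action, outcome ):
    # lists aren't hashable, so convert the state and action to tuples
    key = ( tuple(state), tuple(action), outcome )
    q[key] = q.get( key, 0 ) + 1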

In my implementation, I was thinking about Swift and how I would define a standard protocol for an object that could be used to produce a dictionary of state-action-outcome counts. So far, I think that you’d want methods to return the current state, the legal actions, the outcome (win, lose, draw, incomplete), and methods to make a move and reset the game.
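
I haven't written the Swift version, but here is a rough sketch of that interface in Python using typing.Protocol instead of a Swift protocol; the method names just mirror the TicTacToe class below and aren't a settled API:

from typing import List, Optional, Protocol

class Game( Protocol ):
    def state( self ) -> List[ Optional[int] ]: ...   # flattened board
    def action( self ) -> Optional[ List[int] ]: ...  # a random legal move, or None
    def winner( self ) -> int: ...                     # win / lose / draw / in progress
    def play( self, token: int, location: Optional[ List[int] ] ) -> None: ...
    def reset( self ) -> None: ...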

Here’s my Python code so far:

import random

class TicTacToe:
    def __init__( self, boxes=None ):
        # start from a supplied position if given, otherwise an empty board
        if boxes is None:
            boxes = [ [ None, None, None ]
                    , [ None, None, None ]
                    , [ None, None, None ] ]
        self.boxes = boxes
    def __repr__( self ):
        """
        Produce a pretty-printed version of the board
        """
        s = ""
        for i in range(3):
            for j in range(3):
                if self.boxes[i][j] is None:
                    s += ". "
                else:
                    s += str(self.boxes[i][j])+" "
            s += "\n"
        return s
    def reset( self ):
        self.boxes = [ [ None, None, None ]
                     , [ None, None, None ]
                     , [ None, None, None ] ]
    def getRow( self, i ):
        """
        Return one of the rows
        """
        return [ self.boxes[i][j] for j in range(3) ]
    def getColumn( self, i ):
        """
        Return one of the columns
        """
        return [ self.boxes[j][i] for j in range(3) ]
    def getDiagonal( self, direction ):
        """
        Return one of the diagonals, `direction` is 1 or -1
        """
        if direction == 1:
            return [ self.boxes[i][2-i] for i in range(3) ]
        elif direction == -1:
            return [ self.boxes[i][i] for i in range(3) ]
    def _win( self, itr ):
        """
        Helper function for the winner function
        """
        reduced = list( set( itr ) )
        if len( reduced ) == 1:
            return reduced[0]
        else:
            return None
    def winner( self ):
        """
        Return an integer code for the following cases
          0: player 0 wins
          1: player 1 wins
          2: draw
          3: in progress
        """
        rows = [ self.getRow(i) for i in range(3) ]
        cols = [ self.getColumn(i) for i in range(3) ]
        diags = [ self.getDiagonal(1), self.getDiagonal(-1) ]
        options = rows + cols + diags
        for option in options:
            result = self._win( option )
            if result is not None:
                return result
        if None in self.state():
            return 3
        return 2
    def action( self ):
        """
        Return a random legal move, or None
        """
        # if the board is full there is no legal move
        if None not in self.state():
            return None
        play = []
        for i in range(3):
            for j in range(3):
                if self.boxes[i][j] is None:
                    play.append( [i,j] )
        return random.choice( play )
    def state( self ):
        """
        Return a flattened array of the board
        """
        return [ item for sublist in self.boxes for item in sublist ]
    def play( self, token, location ):
        """
        Place a token at location if location is not None
        """
        if location is None:
            return
        i, j = location
        self.boxes[i][j] = token
    def generateRandomState( self, depth ):
        """
        Reset the board, then play `depth` random moves with alternating
        tokens, and return the resulting flattened state
        """
        # reset first so repeated calls don't build on a stale board
        self.reset()
        for i in range(depth):
            self.play(i%2,self.action())
            if None not in self.state():
                break
        # print(self)
        return self.state()
    def playRandomGame( self, player ):
        """
        Play random moves, alternating tokens starting with `player`,
        until the game is won or drawn
        """
        while True:
            self.play(player%2,self.action())
            # print(self)
            if self.winner() != 3:
                break
            player += 1

def train(game,depth=0,n=1000):
    """
    Build the state-action-outcome counts: for each starting depth d,
    generate n random positions, make one random move, play the rest of
    the game out at random, and count the (state, action, outcome) key
    """
    q = dict()
    for d in range(depth):
        for i in range(n):
            state = game.generateRandomState(d)
            # skip positions where the random setup already finished the game
            if game.winner() < 3:
                continue
            action = game.action()
            game.play(d%2,action)
            game.playRandomGame((d+1)%2)
            winner = game.winner()
            key = ( tuple(state), tuple(action), winner )
            q[key] = q.get(key, 0) + 1
            game.reset()
    return q

def qactions( q, state, winner ):
    """
    Collect the counts of every action recorded from `state` that led
    to the given `winner` outcome
    """
    qs = dict()
    for k,v in q.items():
        _state, _action, _winner = k
        if _state == tuple(state) and _winner == winner:
            qs[_action] = v
    return qs

def qaction( q, state, winner ):
    """
    Return the most frequently counted action for `state` and the desired
    `winner` outcome, or None if nothing has been recorded for that state
    """
    actions = qactions( q, state, winner )
    maximum = 0
    action = None
    for k,v in actions.items():
        if v > maximum:
            maximum = v
            action = k
    return list(action) if action is not None else None

# create a dictionary, q

tictactoe = TicTacToe()
q = train(tictactoe,depth=10,n=50000)

# then, interactively, I take a turn

t = TicTacToe()
t.play(0,[1,1])
print(t)

# and then ask the computer to make a turn

t.play(1,qaction( q, t.state(), 1 ))
print(t)

# et cetera
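
# as a rough sanity check (just a sketch, not part of the interactive session
# above), one could pit the greedy player-1 policy from the q table against
# random play; qaction can return None for states it has never seen, so fall
# back to a random legal move in that case

def playPolicyGame( q, game ):
    game.reset()
    player = 0
    while game.winner() == 3:
        if player % 2 == 1:
            # greedy move from the table, or a random move if the state is unseen
            move = qaction( q, game.state(), 1 ) or game.action()
        else:
            move = game.action()
        game.play( player % 2, move )
        player += 1
    return game.winner()

results = [ playPolicyGame( q, TicTacToe() ) for _ in range(1000) ]
print( "player 1 wins:", results.count(1), "draws:", results.count(2) )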