I was reading another blog post about reinforcement learning using Monte Carlo and tabulation methods that provided an example of the technique using Blackjack. I decided to implement my own method using Tic-Tac-Toe as an example. The basic idea is that you generate a random state of a game, and you generate a random action based on that state. Then you play the rest of the game through until the end and record the outcome. Then you should be able to store the state, action, and outcome as a key in a dictionary that refers to a count. Each time that state-action-outcome occurs again, you update the count by one. Over time, your dictionary will encode information about the relative strengths of different actions for a given state.
In my implementation, I was thinking about Swift and how I would define a standard protocol for an object that could be used to produce a dictionary of state-action-outcome counts. So far, I think that you’d want methods to return the current state, the legal actions, the outcome (win, lose, draw, incomplete), and methods to make a move and reset the game.
Here’s my Python code so far:
import random class TicTacToe: def __init__( self, boxes=None ): self.boxes = [ [ None, None, None ] , [ None, None, None ] , [ None, None, None ] ] def __repr__( self ): """ Produce a pretty-printed version of the board """ s = "" for i in range(3): for j in range(3): if self.boxes[i][j] is None: s += ". " else: s += str(self.boxes[i][j])+" " s += "\n" return s def reset( self ): self.boxes = [ [ None, None, None ] , [ None, None, None ] , [ None, None, None ] ] def getRow( self, i ): """ Return on of the rows """ return [ self.boxes[i][j] for j in range(3) ] def getColumn( self, i ): """ Return one of the columns """ return [ self.boxes[j][i] for j in range(3) ] def getDiagonal( self, direction ): """ Return one of the diagonals, `direction` is 1 or -1 """ if direction == 1: return [ self.boxes[i][2-i] for i in range(3) ] elif direction == -1: return [ self.boxes[i][i] for i in range(3) ] def _win( self, itr ): """ Helper function for the winner function """ reduced = list( set( itr ) ) if len( reduced ) == 1: return reduced else: return None def winner( self ): """ Return an integer code for the following cases 0: 0 wins 1: 1 wins 2: draw 3: in progress """ rows = [ self.getRow(i) for i in range(3) ] cols = [ self.getColumn(i) for i in range(3) ] diags = [ self.getDiagonal(1), self.getDiagonal(-1) ] options = rows + cols + diags for option in options: result = self._win( option ) if result is not None: return result if None in self.state(): return 3 return 2 def action( self ): """ Return a random legal move, or None """ play =  if None not in self.state(): return None for i in range(3): for j in range(3): if self.boxes[i][j] == None: play.append( [i,j] ) random.shuffle( play ) return play.pop() def state( self ): """ Return a flattened array of the board """ return [ item for sublist in self.boxes for item in sublist ] def play( self, token, location ): """ Place a token at location if location is not None """ if location is None: return i, j = location self.boxes[i][j] = token def generateRandomState( self, depth ): for i in range(depth): self.play(i%2,self.action()) if None not in self.state(): break #print self return self.state() def playRandomGame( self, player ): while True: self.play(player%2,self.action()) #print self # check win if self.winner() != 3: break player += 1 def train(game,depth=0,n=1000): q = dict() for d in range(depth): for i in range(n): state = game.generateRandomState(d) if game.winner() < 3: continue if state is None: continue action = game.action() game.play(d%2,action) game.playRandomGame((d+1)%2) winner = game.winner() key = ( tuple(state), tuple(action), winner ) if key in q.keys(): q[key] += 1 else: q[key] = 1 game.reset() return q def qactions( q, state, winner ): qs = dict() for k,v in q.iteritems(): _state, _action, _winner = k if _state == tuple(state) and _winner == winner: qs[_action] = v return qs def qaction( q, state, winner ): actions = qactions( q, state, winner ) maximum = 0 action = None for k,v in actions.iteritems(): if v > maximum: maximum = v action = k return list(action) # create a dictionary, q tictactoe = TicTacToe() q = train(tictactoe,depth=10,n=50000) # then, interactively, I take a turn t = TicTacToe() t.play(0,[1,1]) print t # and then ask the computer to make a turn t.play(1,qaction( q, t.state(), 1 )) print t # et cetra