Reputation: 57421
I've written a program (listed below) which plays Tic Tac Toe with a Tkinter GUI. If I invoke it like this:
root = tk.Tk()
root.title("Tic Tac Toe")
player1 = QPlayer(mark="X")
player2 = QPlayer(mark="O")
human_player = HumanPlayer(mark="X")
player2.epsilon = 0 # For playing the actual match, disable exploratory moves
game = Game(root, player1=human_player, player2=player2)
game.play()
root.mainloop()
it works as expected and the HumanPlayer can play against player2, which is a computer player (specifically, a QPlayer). The figure below shows how the HumanPlayer (with mark "X") easily wins.
In order to improve the performance of the QPlayer, I'd like to 'train' it by letting it play against an instance of itself before it plays against the human player. I've tried modifying the above code as follows:
root = tk.Tk()
root.title("Tic Tac Toe")
player1 = QPlayer(mark="X")
player2 = QPlayer(mark="O")
for _ in range(1): # Play a couple of training games
training_game = Game(root, player1, player2)
training_game.play()
training_game.reset()
human_player = HumanPlayer(mark="X")
player2.epsilon = 0 # For playing the actual match, disable exploratory moves
game = Game(root, player1=human_player, player2=player2)
game.play()
root.mainloop()
What I then find, however, is that the Tkinter window contains two Tic Tac Toe boards (depicted below), and the buttons of the second board are unresponsive.
In the above code, the reset() method is the same one used in the callback of the "Reset" button, which normally makes the board blank again so play can start over. Why am I seeing two boards (one of which is unresponsive) instead of a single, responsive board?
For reference, the full code of the Tic Tac Toe program is listed below (with the 'offensive' lines of code commented out):
import numpy as np
import Tkinter as tk
import copy
class Game:
def __init__(self, master, player1, player2, Q_learn=None, Q={}, alpha=0.3, gamma=0.9):
frame = tk.Frame()
frame.grid()
self.master = master
self.player1 = player1
self.player2 = player2
self.current_player = player1
self.other_player = player2
self.empty_text = ""
self.board = Board()
self.buttons = [[None for _ in range(3)] for _ in range(3)]
for i in range(3):
for j in range(3):
self.buttons[i][j] = tk.Button(frame, height=3, width=3, text=self.empty_text, command=lambda i=i, j=j: self.callback(self.buttons[i][j]))
self.buttons[i][j].grid(row=i, column=j)
self.reset_button = tk.Button(text="Reset", command=self.reset)
self.reset_button.grid(row=3)
self.Q_learn = Q_learn
self.Q_learn_or_not()
if self.Q_learn:
self.Q = Q
self.alpha = alpha # Learning rate
self.gamma = gamma # Discount rate
self.share_Q_with_players()
def Q_learn_or_not(self): # If either player is a QPlayer, turn on Q-learning
if self.Q_learn is None:
if isinstance(self.player1, QPlayer) or isinstance(self.player2, QPlayer):
self.Q_learn = True
def share_Q_with_players(self): # The action value table Q is shared with the QPlayers to help them make their move decisions
if isinstance(self.player1, QPlayer):
self.player1.Q = self.Q
if isinstance(self.player2, QPlayer):
self.player2.Q = self.Q
def callback(self, button):
if self.board.over():
pass # Do nothing if the game is already over
else:
if isinstance(self.current_player, HumanPlayer) and isinstance(self.other_player, HumanPlayer):
if self.empty(button):
move = self.get_move(button)
self.handle_move(move)
elif isinstance(self.current_player, HumanPlayer) and isinstance(self.other_player, ComputerPlayer):
computer_player = self.other_player
if self.empty(button):
human_move = self.get_move(button)
self.handle_move(human_move)
if not self.board.over(): # Trigger the computer's next move
computer_move = computer_player.get_move(self.board)
self.handle_move(computer_move)
def empty(self, button):
return button["text"] == self.empty_text
def get_move(self, button):
info = button.grid_info()
move = (info["row"], info["column"]) # Get move coordinates from the button's metadata
return move
def handle_move(self, move):
try:
if self.Q_learn:
self.learn_Q(move)
i, j = move # Get row and column number of the corresponding button
self.buttons[i][j].configure(text=self.current_player.mark) # Change the label on the button to the current player's mark
self.board.place_mark(move, self.current_player.mark) # Update the board
if self.board.over():
self.declare_outcome()
else:
self.switch_players()
except:
print "There was an error handling the move."
pass # This might occur if no moves are available and the game is already over
def declare_outcome(self):
if self.board.winner() is None:
print "Cat's game."
else:
print "The game is over. The player with mark %s won!" % self.current_player.mark
def reset(self):
print "Resetting..."
for i in range(3):
for j in range(3):
self.buttons[i][j].configure(text=self.empty_text)
self.board = Board(grid=np.ones((3,3))*np.nan)
self.current_player = self.player1
self.other_player = self.player2
# np.random.seed(seed=0) # Set the random seed to zero to see the Q-learning 'in action' or for debugging purposes
self.play()
def switch_players(self):
if self.current_player == self.player1:
self.current_player = self.player2
self.other_player = self.player1
else:
self.current_player = self.player1
self.other_player = self.player2
def play(self):
if isinstance(self.player1, HumanPlayer) and isinstance(self.player2, HumanPlayer):
pass # For human vs. human, play relies on the callback from button presses
elif isinstance(self.player1, HumanPlayer) and isinstance(self.player2, ComputerPlayer):
pass
elif isinstance(self.player1, ComputerPlayer) and isinstance(self.player2, HumanPlayer):
first_computer_move = self.player1.get_move(self.board) # If player 1 is a computer, it needs to be triggered to make the first move.
self.handle_move(first_computer_move)
elif isinstance(self.player1, ComputerPlayer) and isinstance(self.player2, ComputerPlayer):
while not self.board.over(): # Make the two computer players play against each other without button presses
move = self.current_player.get_move(self.board)
self.handle_move(move)
def learn_Q(self, move): # If Q-learning is toggled on, "learn_Q" should be called after receiving a move from an instance of Player and before implementing the move (using Board's "place_mark" method)
state_key = QPlayer.make_and_maybe_add_key(self.board, self.current_player.mark, self.Q)
next_board = self.board.get_next_board(move, self.current_player.mark)
reward = next_board.give_reward()
next_state_key = QPlayer.make_and_maybe_add_key(next_board, self.other_player.mark, self.Q)
if next_board.over():
expected = reward
else:
next_Qs = self.Q[next_state_key] # The Q values represent the expected future reward for player X for each available move in the next state (after the move has been made)
if self.current_player.mark == "X":
expected = reward + (self.gamma * min(next_Qs.values())) # If the current player is X, the next player is O, and the move with the minimum Q value should be chosen according to our "sign convention"
elif self.current_player.mark == "O":
expected = reward + (self.gamma * max(next_Qs.values())) # If the current player is O, the next player is X, and the move with the maximum Q vlue should be chosen
change = self.alpha * (expected - self.Q[state_key][move])
self.Q[state_key][move] += change
class Board:
def __init__(self, grid=np.ones((3,3))*np.nan):
self.grid = grid
def winner(self):
rows = [self.grid[i,:] for i in range(3)]
cols = [self.grid[:,j] for j in range(3)]
diag = [np.array([self.grid[i,i] for i in range(3)])]
cross_diag = [np.array([self.grid[2-i,i] for i in range(3)])]
lanes = np.concatenate((rows, cols, diag, cross_diag)) # A "lane" is defined as a row, column, diagonal, or cross-diagonal
any_lane = lambda x: any([np.array_equal(lane, x) for lane in lanes]) # Returns true if any lane is equal to the input argument "x"
if any_lane(np.ones(3)):
return "X"
elif any_lane(np.zeros(3)):
return "O"
def over(self): # The game is over if there is a winner or if no squares remain empty (cat's game)
return (not np.any(np.isnan(self.grid))) or (self.winner() is not None)
def place_mark(self, move, mark): # Place a mark on the board
num = Board.mark2num(mark)
self.grid[tuple(move)] = num
@staticmethod
def mark2num(mark): # Converts a player's mark to a number to be inserted in the Numpy array representing the board. The mark must be either "X" or "O".
d = {"X": 1, "O": 0}
return d[mark]
def available_moves(self):
return [(i,j) for i in range(3) for j in range(3) if np.isnan(self.grid[i][j])]
def get_next_board(self, move, mark):
next_board = copy.deepcopy(self)
next_board.place_mark(move, mark)
return next_board
def make_key(self, mark): # For Q-learning, returns a 10-character string representing the state of the board and the player whose turn it is
fill_value = 9
filled_grid = copy.deepcopy(self.grid)
np.place(filled_grid, np.isnan(filled_grid), fill_value)
return "".join(map(str, (map(int, filled_grid.flatten())))) + mark
def give_reward(self): # Assign a reward for the player with mark X in the current board position.
if self.over():
if self.winner() is not None:
if self.winner() == "X":
return 1.0 # Player X won -> positive reward
elif self.winner() == "O":
return -1.0 # Player O won -> negative reward
else:
return 0.5 # A smaller positive reward for cat's game
else:
return 0.0 # No reward if the game is not yet finished
class Player(object):
def __init__(self, mark):
self.mark = mark
self.get_opponent_mark()
def get_opponent_mark(self):
if self.mark == 'X':
self.opponent_mark = 'O'
elif self.mark == 'O':
self.opponent_mark = 'X'
else:
print "The player's mark must be either 'X' or 'O'."
class HumanPlayer(Player):
def __init__(self, mark):
super(HumanPlayer, self).__init__(mark=mark)
class ComputerPlayer(Player):
def __init__(self, mark):
super(ComputerPlayer, self).__init__(mark=mark)
class RandomPlayer(ComputerPlayer):
def __init__(self, mark):
super(RandomPlayer, self).__init__(mark=mark)
@staticmethod
def get_move(board):
moves = board.available_moves()
if moves: # If "moves" is not an empty list (as it would be if cat's game were reached)
return moves[np.random.choice(len(moves))] # Apply random selection to the index, as otherwise it will be seen as a 2D array
class THandPlayer(ComputerPlayer):
def __init__(self, mark):
super(THandPlayer, self).__init__(mark=mark)
def get_move(self, board):
moves = board.available_moves()
if moves:
for move in moves:
if THandPlayer.next_move_winner(board, move, self.mark):
return move
elif THandPlayer.next_move_winner(board, move, self.opponent_mark):
return move
else:
return RandomPlayer.get_move(board)
@staticmethod
def next_move_winner(board, move, mark):
return board.get_next_board(move, mark).winner() == mark
class QPlayer(ComputerPlayer):
def __init__(self, mark, Q={}, epsilon=0.2):
super(QPlayer, self).__init__(mark=mark)
self.Q = Q
self.epsilon = epsilon
def get_move(self, board):
if np.random.uniform() < self.epsilon: # With probability epsilon, choose a move at random ("epsilon-greedy" exploration)
return RandomPlayer.get_move(board)
else:
state_key = QPlayer.make_and_maybe_add_key(board, self.mark, self.Q)
Qs = self.Q[state_key]
if self.mark == "X":
return QPlayer.stochastic_argminmax(Qs, max)
elif self.mark == "O":
return QPlayer.stochastic_argminmax(Qs, min)
@staticmethod
def make_and_maybe_add_key(board, mark, Q): # Make a dictionary key for the current state (board + player turn) and if Q does not yet have it, add it to Q
state_key = board.make_key(mark)
if Q.get(state_key) is None:
moves = board.available_moves()
Q[state_key] = {move: 0.0 for move in moves} # The available moves in each state are initially given a default value of zero
return state_key
@staticmethod
def stochastic_argminmax(Qs, min_or_max): # Determines either the argmin or argmax of the array Qs such that if there are 'ties', one is chosen at random
min_or_maxQ = min_or_max(Qs.values())
if Qs.values().count(min_or_maxQ) > 1: # If there is more than one move corresponding to the maximum Q-value, choose one at random
best_options = [move for move in Qs.keys() if Qs[move] == min_or_maxQ]
move = best_options[np.random.choice(len(best_options))]
else:
move = min_or_max(Qs, key=Qs.get)
return move
root = tk.Tk()
root.title("Tic Tac Toe")
player1 = QPlayer(mark="X")
player2 = QPlayer(mark="O")
# for _ in range(1): # Play a couple of training games
# training_game = Game(root, player1, player2)
# training_game.play()
# training_game.reset()
human_player = HumanPlayer(mark="X")
player2.epsilon = 0 # For playing the actual match, disable exploratory moves
game = Game(root, player1=human_player, player2=player2)
game.play()
root.mainloop()
Upvotes: 0
Views: 2127
Reputation: 54
I've noticed that if you were to run this code with Python 3.2.3 or a similar version, all of the print statements would need to be enclosed in parentheses, and the Tkinter import would need to use the lowercase tkinter module.
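For example, a minimal sketch of just those Python 3 adjustments (only the changed lines are shown; the rest of the program is assumed unchanged):
import tkinter as tk  # Python 3: the module name is lowercase "tkinter"

print("Resetting...")  # Python 3: print is a function, so the argument needs parentheses
print("Cat's game.")
print("There was an error handling the move.")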
Upvotes: 0
Reputation: 33193
It looks like you only need to create the board once, since the reset method clears it for the new players. Each time you create a Game instance, you create a new Tk frame, so you either need to destroy the old one or reuse the existing window by not creating a new Game instance each time.
A minor change to the main code at the bottom of the file seems to fix this:
player1 = QPlayer(mark="X")
player2 = QPlayer(mark="O")
game = Game(root, player1, player2)
for _ in range(1): # Play a couple of training games
game.play()
game.reset()
human_player = HumanPlayer(mark="X")
player2.epsilon = 0 # For playing the actual match, disable exploratory moves
game.player1 = human_player
game.player2 = player2
game.play()
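Alternatively, if you do want a fresh Game for the real match, the training game's widgets have to be destroyed first. A rough sketch, assuming Game is modified to keep a reference to its frame (e.g. self.frame = tk.Frame(master) followed by self.frame.grid() in __init__), might look like this:
training_game = Game(root, player1, player2)
training_game.play()
training_game.frame.destroy()         # hypothetical attribute; requires storing the frame in __init__ as noted above
training_game.reset_button.destroy()  # the Reset button is created outside the frame, so remove it separately
game = Game(root, player1=human_player, player2=player2)
game.play()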
Upvotes: 1