三目並べの強化学習

29 Nov 2018

Reading time ~5 minutes

NNで三目並べを強化学習させる

三目並べをプレイするために、

- マス目の位置
- 9次元のベクトル
- 最善の手

を与えることで、ニューラルネットワークをトレーニングする。

ここでは、三目並べの限られた数の局面を与えるほか、トレーニングセットのサイズを増やすために各局面にランダムな座標変換を適用する。

このアルゴリズムをテストするために、1つの局面の指し手を削除し、モデルが最善の手を予測できるかどうかを確認する。

最後に実際にモデルと対戦をする。

このレシピの局面と最善の手からなるリストはここに用意されている。

import tensorflow as tf
import matplotlib.pyplot as plt
import csv
import numpy as np
import random
from tensorflow.python.framework import ops
# Reset TensorFlow's default graph before the model is built.
ops.reset_default_graph()

# Batch size used when training the model.
# NOTE(review): the original comment recommended 5000, but each batch is drawn
# without replacement from a training set of at most 500 samples, so the value
# presumably has to stay at or below len(train_set) - confirm.
batch_size = 50

# Print a board position using X and O marks
def print_board(board):
    """Print a 3x3 tic-tac-toe board to standard output.

    :param board: sequence of at least 9 cell values in row-major order; each
     value is converted with int() and drawn as
     -1 -> 'O', 0 -> ' ' (empty), 1 -> 'X'
    :return: None
    """
    # A cell value plus one is the index into this table. The mark for -1 is
    # the letter 'O' (not the digit zero).
    symbols = ['O', ' ', 'X']
    marks = [symbols[int(cell) + 1] for cell in board]
    for row_start in (0, 3, 6):
        if row_start:
            # Separator line between consecutive rows
            print('________________')
        print(' ' + ' | '.join(marks[row_start:row_start + 3]))

# Return a new position and best move obtained by applying a coordinate transformation
def get_symmetry(board, response, transformation):
    """
    Apply one symmetry of the 3x3 grid to a position and to its best move.

    :param board: list of 9 integers in row-major order:
     opposing mark = -1
     friendly mark = 1
     empty space = 0
    :param response: index (0-8) of the best move on ``board``
    :param transformation: one of the following five transformations
     'rotate180', 'rotate90', 'rotate270', 'flip_v', 'flip_h'
    :return: tuple: (new_board, new_response)
    :raises ValueError: if ``transformation`` is not one of the names above
     (or, for every transformation except 'rotate180', if ``response`` is
     not an index in 0-8)
    """
    if transformation == 'rotate180':
        # Reversing the flat list rotates the grid by half a turn
        mirrored_response = 8 - response
        return board[::-1], mirrored_response

    if transformation == 'rotate90':
        # Each table lists, per new cell, the old cell it is taken from, so
        # the move lands at the position of its old index in the table
        moved_response = [6, 3, 0, 7, 4, 1, 8, 5, 2].index(response)
        columns = list(zip(board[6:9], board[3:6], board[0:3]))
        return [cell for column in columns for cell in column], moved_response

    if transformation == 'rotate270':
        moved_response = [2, 5, 8, 1, 4, 7, 0, 3, 6].index(response)
        columns = list(zip(board[0:3], board[3:6], board[6:9]))
        columns.reverse()
        return [cell for column in columns for cell in column], moved_response

    if transformation == 'flip_v':
        moved_response = [6, 7, 8, 3, 4, 5, 0, 1, 2].index(response)
        bottom_row = board[6:9]
        middle_row = board[3:6]
        top_row = board[0:3]
        return bottom_row + middle_row + top_row, moved_response

    if transformation == 'flip_h':
        # flip_h = rotate180, then flip_v
        moved_response = [2, 1, 0, 5, 4, 3, 8, 7, 6].index(response)
        half_turn = board[::-1]
        return half_turn[6:9] + half_turn[3:6] + half_turn[0:3], moved_response

    raise ValueError('Method not implemented.')

# The positions and their best moves are stored in a .csv file.
# This function reads them from the file and returns them as a list of tuples.
def get_moves_from_csv(csv_file):
    """
    Load (board, best move) pairs from a CSV file.

    Each non-empty row must hold at least ten integer fields: the first nine
    are the board cells and the tenth is the index of the best move.

    :param csv_file: path of the CSV file with positions and best moves
    :return: moves: list of tuples (list of 9 ints, int best-move index)
    :raises ValueError: if a field cannot be parsed as an integer
    :raises IndexError: if a non-empty row has fewer than ten fields
    """
    moves = []
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(csv_file, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            if not row:
                # Blank lines (e.g. a trailing one) are read as empty rows
                continue
            moves.append(([int(x) for x in row[0:9]], int(row[9])))
    return moves

# Combine several functions to return a randomly transformed position and its best move
def get_rand_move(moves, rand_transforms=2):
    """
    Pick a random (board, best move) pair and apply random symmetries to it.

    :param moves: sequence of (board, response) pairs to choose from
    :param rand_transforms: number of randomly chosen transformations that
     are applied one after another
    :return: tuple: (board, response) after the transformations
    """
    board, response = random.choice(moves)
    possible_transforms = ('rotate90', 'rotate180', 'rotate270', 'flip_v', 'flip_h')
    for _ in range(rand_transforms):
        board, response = get_symmetry(board, response, random.choice(possible_transforms))
    return board, response

# Start a graph session, then load the data and build the training set
sess = tf.Session()

# Load the list of positions and their best moves
moves = get_moves_from_csv('base_tic_tac_toe_moves.csv')

# Build the training set from randomly transformed positions
train_length = 500
train_set = [get_rand_move(moves) for _ in range(train_length)]

# To see if the network learns anything new, remove every instance of the
# board [-1, 0, 0, 1, -1, -1, 0, 0, 1] from the training set. The optimal
# response to it is index 6; the model is tested on this board at the end.
test_board = [-1, 0, 0, 1, -1, -1, 0, 0, 1]
train_set = [sample for sample in train_set if sample[0] != test_board]

# Define the functions that create the model's variables and operations.
# Note that the model itself contains no softmax activation function
# (softmax()); it is part of the loss function instead.
def init_weights(shape):
    """Return a new tf.Variable of the given shape, initialised from tf.random_normal.

    :param shape: shape passed to tf.random_normal
    :return: the created tf.Variable
    """
    initial_values = tf.random_normal(shape)
    return tf.Variable(initial_values)


def model(X, A1, A2, bias1, bias2):
    """Build the two-layer network and return the output of its second layer.

    The first layer is ``sigmoid(X @ A1 + bias1)``; the second layer is the
    affine map ``layer1 @ A2 + bias2`` with no activation applied.

    :param X: input tensor
    :param A1: weights of the first layer
    :param A2: weights of the second layer
    :param bias1: bias of the first layer
    :param bias2: bias of the second layer
    :return: output tensor of the second layer
    """
    hidden_input = tf.add(tf.matmul(X, A1), bias1)
    hidden = tf.nn.sigmoid(hidden_input)
    output = tf.add(tf.matmul(hidden, A2), bias2)
    return output

# Set up the placeholders, variables and model.
# X takes batches of 9-value boards, Y the integer index of the best move.
X = tf.placeholder(dtype=tf.float32, shape=[None, 9])
Y = tf.placeholder(dtype=tf.int32, shape=[None])

# 9 inputs -> 81 hidden units -> 9 outputs
A1 = init_weights([9, 81])
bias1 = init_weights([81])
A2 = init_weights([81, 9])
bias2 = init_weights([9])

model_output = model(X, A1, A2, bias1, bias2)

# Set up the loss function, then the training step and the optimizer.
# A prediction operation is also needed in order to play against the model later.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model_output, labels=Y))
train_step = tf.train.GradientDescentOptimizer(0.025).minimize(loss)
prediction = tf.argmax(model_output, 1)

# Initialise the variables before training the neural network.
init = tf.global_variables_initializer()
sess.run(init)

# Train the network, recording the batch loss of every iteration in loss_vec
loss_vec = []
for i in range(10000):
    # Randomly pick the indices that make up this batch (no repeats)
    rand_indices = np.random.choice(range(len(train_set)), batch_size, replace=False)
    # Gather the sampled (board, best move) pairs
    batch_data = [train_set[idx] for idx in rand_indices]
    x_input = [board for board, _ in batch_data]
    y_target = np.array([best_move for _, best_move in batch_data])
    batch_feed = {X: x_input, Y: y_target}

    # Run one training step
    sess.run(train_step, feed_dict=batch_feed)

    # Evaluate the loss on the same batch, after the update
    temp_loss = sess.run(loss, feed_dict=batch_feed)
    loss_vec.append(temp_loss)

    if i % 500 == 0:
        print('Iteration ' + str(i) + ' Loss: ' + str(temp_loss))

(こんな感じの結果になる)

Iteration 0 Loss: 8.904177
Iteration 500 Loss: 1.74531
Iteration 1000 Loss: 1.4589853
Iteration 1500 Loss: 1.1635289
Iteration 2000 Loss: 1.3590469
Iteration 2500 Loss: 1.3962117
Iteration 3000 Loss: 0.9921367
Iteration 3500 Loss: 1.0137455
Iteration 4000 Loss: 1.0708985
Iteration 4500 Loss: 1.0389541
Iteration 5000 Loss: 1.0655681
Iteration 5500 Loss: 1.1608162
Iteration 6000 Loss: 0.8579481
Iteration 6500 Loss: 0.9746061
Iteration 7000 Loss: 0.7223426
Iteration 7500 Loss: 0.7803637
Iteration 8000 Loss: 0.8147448
Iteration 8500 Loss: 0.7135554
Iteration 9000 Loss: 0.6399172
Iteration 9500 Loss: 0.7355053

このモデルのトレーニングセットの損失値をプロットする。

# Plot the loss recorded in loss_vec, one point per training iteration.
plt.plot(loss_vec, 'k-', label='Loss')
# The plotted loss is the mean softmax cross entropy, not a mean squared
# error, so the title names it accordingly.
plt.title('Loss (cross entropy) per Generation')
plt.xlabel('Generation')
plt.ylabel('Loss')
plt.show()

aaa

テスト

# Test
# Check what happens when the model is run on the position that was removed
# from the training set.
# Predict the move for test_board (the best move is index 6).
test_boards = [test_board]
feed_dict = {X: test_boards}
# NOTE(review): logits is not used again in the visible code.
logits = sess.run(model_output, feed_dict=feed_dict)
predictions = sess.run(prediction, feed_dict=feed_dict)
print(predictions)

# Evaluation
# To play against the trained model, a function that checks whether the game
# has been won is needed.
def check(board):
    """Report whether either side has three marks in a line.

    :param board: sequence of 9 cell values in row-major order, where 1 and
     -1 are the two players' marks
    :return: 1 if any row, column or diagonal holds three 1s or three -1s,
     otherwise 0
    """
    wins = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [2, 5, 8], [0, 4, 8], [2, 4, 6]]
    for first, second, third in wins:
        line = (board[first], board[second], board[third])
        # The result does not say which side completed the line
        if line == (1., 1., 1.) or line == (-1., -1., -1.):
            return 1
    return 0
    
# Play against the trained model.
# All squares start empty (0.). Each turn the player enters an index, the
# board is fed to the model, and the model's next move is predicted.
# The player's marks are stored as 1. and the model's marks as -1.
game_tracker = [0., 0., 0., 0., 0., 0., 0., 0., 0.]
win_logical = False
num_moves = 0
while not win_logical:
    player_index = input('Input index of your move (0-8): ')
    # Reject non-numeric input, out-of-range indices and occupied squares so
    # that a bad entry can neither crash the loop nor overwrite a mark.
    try:
        player_move = int(player_index)
    except ValueError:
        player_move = -1
    if not 0 <= player_move <= 8 or game_tracker[player_move] != 0.:
        print('Invalid move, please choose an empty square (0-8).')
        continue
    num_moves += 1
    # Add player move to game
    game_tracker[player_move] = 1.

    # Stop before the model moves if the player has just won or has filled
    # the last empty square (the player's fifth move). Otherwise the model
    # would move after the game is decided, or would have no legal square.
    if check(game_tracker) == 1 or num_moves >= 5:
        print_board(game_tracker)
        print('Game Over!')
        win_logical = True
        continue

    # Get model's move by first getting all the logits for each index
    [potential_moves] = sess.run(model_output, feed_dict={X: [game_tracker]})
    # Now find allowed moves (where game tracker values = 0.0)
    allowed_moves = [ix for ix, x in enumerate(game_tracker) if x == 0.0]
    # Best move = allowed square with the largest logit (first one on ties)
    model_move = max(allowed_moves, key=lambda ix: potential_moves[ix])

    # Add model move to game
    game_tracker[int(model_move)] = -1.
    print('Model has moved')
    print_board(game_tracker)
    # Now check whether the model's move completed a line
    if check(game_tracker) == 1:
        print('Game Over!')
        win_logical = True