
Teaching an AI Network to Learn Gomoku on Its Own

I've been learning AI recently and wanted to build something with it, so I tried a relatively simple project: can an AI teach itself to play Gomoku (five-in-a-row)? I didn't want to write any game logic at all; I only tell the AI "you just lost." In other words, it learns purely from feedback, rewards and penalties, in the spirit of deep reinforcement learning (DQN). I kept it very simple: two identical networks that score board positions and play against each other. The intended policy is to follow the trained network most of the time and play random moves some of the time (to explore for better answers). During training the loss keeps dropping, the games get longer, and the stones become more spread out. In the printed boards below, "*" marks player 1's stones and "0" player 0's stones (player 1 moves first).
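
As a minimal sketch of that explore/exploit policy (epsilon and pick_best_move are illustrative names, not from the code below, where the random branch is currently switched off):

import numpy as np

def choose_move( epsilon=0.1 ):
    # With probability epsilon explore a uniformly random empty cell;
    # otherwise exploit the move the value network scores highest.
    if np.random.uniform() < epsilon:
        empty = [(i, j) for i in range( M ) for j in range( N ) if chessState[i, j] == -1]
        return empty[np.random.randint( len( empty ) )]
    return pick_best_move()  # hypothetical greedy lookup; cf. PlayOneStep below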

Because a friend's AI project got under way, I never followed up on this one. If you're interested, feel free to polish it; let's exchange ideas!

import tensorflow as tf
import numpy as np
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
M = 10  # board height
N = 10  # board width
chessState = {}  # (row, col) -> -1 empty, 1 player 1, 0 player 0


def getTrainData():
    # Flatten the board dict into a 10x10 numpy array.
    tmp = []
    for i in range( M ):
        one = []
        for j in range( N ):
            one.append( chessState[i, j] )
        tmp.append( one )
    return np.reshape( tmp, [-1, 10] )
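
# Example: on a fresh board getTrainData() returns a 10x10 array of -1;
# after chessState[5, 5] = 1 the centre entry reads 1.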


def getOneEpData():
    # Collect the board snapshots recorded for the current episode
    # (defined but never called by the main loop below).
    tmp = []
    for i in range( len( OneRePlay ) ):
        tmp.append( OneRePlay[i] )
    return tmp


def initState():
    # Reset the board: -1 marks an empty cell.
    for i in range( M ):
        for j in range( N ):
            chessState[i, j] = -1
    # chessState[5,5]=1

def outPutChess():
    # Print the board: empty cells blank, player 1's stones as "*", player
    # 0's as "0" (replace "-1" before "1" so empty cells stay blank).
    print( "{}".format( getTrainData() ).replace( "-1", "  " ).replace( "1", "*" ) )

def outputState():
    # Alternative board printer (unused): "-" for empty, "*" for player 1.
    for i in range( M ):
        print( "" )
        row = "  ".join( "{:2d}".format( chessState[i, j] ) for j in range( N ) )
        print( row.replace( "-1", "-" ).replace( "1", "*" ) )


def PlayOneStep(L=0., who=-1, sess=tf.Session):
    # One-ply greedy search: try `who` on every empty cell, score the
    # resulting board with the value-network output L, and keep the best.
    i1, j1 = 0, 0
    p1, p2 = 0.00, 0.000

    for i in range( M ):  # try every empty cell
        for j in range( N ):
            if (chessState[i, j] == -1):
                chessState[i, j] = who
                p1 = sess.run( L, feed_dict={x: np.reshape( getTrainData(), [-1, 100] )} )
                if (p1[0][0] > p2):
                    i1, j1 = i, j
                    p2 = p1[0][0]
                chessState[i, j] = -1  # undo the trial move
    if (True):  # change to (np.random.uniform() < 1 - p2) to re-enable random exploration
        return i1, j1
    else:
        while True:
            i1 = np.random.randint( 0, M )  # bug fix: randint(0, 9) could never pick row 9
            j1 = np.random.randint( 0, N )  # bug fix: likewise for column 9
            if (chessState[i1, j1] == -1):
                break
    return i1, j1
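
# A cheaper variant (a sketch, not wired into the loop below): batch every
# candidate board into a single forward pass instead of one sess.run per cell.
def PlayOneStepBatched(L, who, sess):
    moves, boards = [], []
    for i in range( M ):
        for j in range( N ):
            if (chessState[i, j] == -1):
                chessState[i, j] = who  # trial move
                boards.append( np.reshape( getTrainData(), [100] ) )
                moves.append( (i, j) )
                chessState[i, j] = -1  # undo
    scores = sess.run( L, feed_dict={x: np.array( boards )} )  # shape [len(moves), 1]
    return moves[int( np.argmax( scores ) )]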

def Normalize(data):
    # Mean-centre and range-scale a list (defined but unused below).
    m = np.mean(data)
    mx = max(data)
    mn = min(data)
    return [(float(i) - m) / (mx - mn) for i in data]

def whoWin(who=1):
    # Scan the board; return (True, direction) once `who` has five in a row.
    h, v, p, l = 0, 0, 0, 0
    for i in range( M ):
        for j in range( N ):
            if (chessState[i, j] == who):
                h, v, p, l = 1, 1, 1, 1
                for m in range( j + 1, N ):  # horizontal —
                    if (chessState[i, m] == who):
                        h += 1
                        if (h >= 5):
                            return True, "—"
                    else:
                        h = 0

                for m in range( i + 1, M ):  # vertical |
                    if (chessState[m, j] == who):
                        v += 1
                        if (v >= 5):
                            return True, "|"
                    else:
                        v = 0

                for m in range( 1, M - j ):  # diagonal \
                    if (i + m >= M):
                        break
                    if (j + m >= M):
                        break
                    if (chessState[i + m, j + m] == who):
                        p += 1
                        if (p >= 5):
                            return True, "\\"
                    else:
                        p = 0

                for m in range( 1, M - i ):  # anti-diagonal /
                    if (i + m >= M):
                        break
                    if (j - m < 0):
                        break
                    if (chessState[i + m, j - m] == who):
                        l += 1
                        if (l >= 5):
                            return True, "/"
                    else:
                        l = 0

    return False, ""
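
# Example: with player-1 stones on (1,5) through (1,9), whoWin(1) returns
# (True, "—"); on an empty board it returns (False, "").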


initState()
learning_rate = 0.0001

"""
chessState[1,5]=1
chessState[1,6]=1
chessState[1,7]=1
chessState[1,8]=1
chessState[1,9]=1
print("1111")
print(whoWin(1))
print(getTrainData())
exit()

chessState[1,9]=-1
chessState[2,8]=-1
chessState[3,7]=-1
chessState[4,6]=-1
chessState[5,5]=-1

print(whoWin(-1))
exit(0)
"""
#print( getTrainData() )
outPutChess()
# first player's network
x = tf.placeholder( dtype=tf.float32, shape=[None, 100], name="X_In" )
y = tf.placeholder( dtype=tf.float32, shape=[None, 1], name="Y_In" )

w1 = tf.get_variable( "W1", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer() )
b1 = tf.get_variable( "b1", shape=[40], initializer=tf.contrib.layers.xavier_initializer() )

w2 = tf.get_variable( "W2", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer() )
# np.random.uniform(0,1,size=[50,1]))
b2 = tf.get_variable( "b2", shape=[1], initializer=tf.contrib.layers.xavier_initializer() )

L1 = tf.matmul( x, w1 ) + b1
L2_R = tf.matmul( L1, w2 ) + b2
L2 = tf.nn.sigmoid( L2_R )

# loglik = tf.log(y * (y - L2) + (1 - y) * (y + L2))
# los = -tf.reduce_mean(loglik)

#los = -tf.reduce_mean( y * tf.log( L2_R ) )
los = tf.reduce_mean( tf.square( L2_R - y ) )
#los = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2,labels=y)
# los=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2, labels=y))
train = tf.train.AdamOptimizer( learning_rate ).minimize( los )
# train=tf.train.GradientDescentOptimizer(0.0001).minimize(los)

# second player's network
w1_h = tf.get_variable( "w1_h", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer() )
b1_h = tf.get_variable( "b1_h", shape=[40], initializer=tf.contrib.layers.xavier_initializer() )

w2_h = tf.get_variable( "w2_h", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer() )
b2_h = tf.get_variable( "b2_h", shape=[1], initializer=tf.contrib.layers.xavier_initializer() )

L1_h = tf.matmul( x, w1_h ) + b1_h
L2_h_R = tf.matmul( L1_h, w2_h ) + b2_h  # bug fix: was matmul( L1, ... ), which fed the first network's hidden layer
L2_h = tf.nn.sigmoid( L2_h_R )

# loglik_h = tf.log(y * (y - L2_h) + (1 - y) * (y + L2_h))
# los_h = -tf.reduce_mean(loglik_h)

#los_h = -tf.reduce_mean( y * tf.log( L2_h_R ) )
los_h = tf.reduce_mean( tf.square( L2_h_R - y ) )
#los_h = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2_h,labels=y)

# los_h=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2_h, labels=y))
# train_h=tf.train.GradientDescentOptimizer(0.001).minimize(los_h)
train_h = tf.train.AdamOptimizer( learning_rate ).minimize( los_h )
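
# Both networks share one architecture: 100 board cells -> 40 hidden units
# (note there is no activation on the hidden layer, so each net is
# effectively linear) -> 1 sigmoid output; training minimises the MSE
# between the pre-sigmoid output and the propagated reward target y.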

epCount = 0
AllReplay = []
AllReward = []
AllRewardH = []

OneRePlay = []
OneReward = []
OneRewardH = []
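
# OneRePlay holds this episode's board snapshots; OneReward / OneRewardH
# hold per-move reward targets for player 1 and player 0. The All* lists
# form the cross-episode replay buffer fed to both optimisers.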

OneRePlay.append( getTrainData() )
OneReward.append( 1 )
OneRewardH.append( 1 )
step = 0
who = 1  # 1 = first player (moves first); 0 = second player
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run( init )
    while True:
        oneTran = getTrainData()
        if (who == 1):
            i, j = PlayOneStep( L2, who, sess )  # first player's move
        else:
            i, j = PlayOneStep( L2_h, who, sess )  # second player's move

        chessState[i, j] = who
        done, posWin = whoWin( who )
        step += 1
        if (done):
            print(
                "------【{:1d}】----------------{:6d}----------【{:s}】----step{:d}--------".format( who, epCount, posWin,
                                                                                                 step ).replace("-1","-") )
            step = 0
            #print( getTrainData() )
            #outputState()
            outPutChess()
            done = True
            iMax = len( OneReward )
            if (who == 1):  # first player won
                OneReward[iMax - 1] = 0.96
                for i in reversed( range( iMax - 1 ) ):
                    OneReward[i] *= OneReward[i + 1] * 0.995
                OneRewardH[iMax - 1] = 0.10
                for i in reversed( range( iMax - 1 ) ):
                    OneRewardH[i] *= OneRewardH[i + 1] * 1.02
            else:  # second player won
                OneRewardH[iMax - 1] = 0.96
                for i in reversed( range( iMax - 1 ) ):
                    OneRewardH[i] *= OneRewardH[i + 1] * 0.995
              
                OneReward[iMax - 1] = 0.10
                for i in reversed( range( iMax - 1 ) ):
                    OneReward[i] *= OneReward[i + 1] * 1.02
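
            # Worked example with 4 recorded states and a first-player win:
            #   OneReward  = [0.96*0.995**3, 0.96*0.995**2, 0.96*0.995, 0.96]
            #   OneRewardH = [0.10*1.02**3,  0.10*1.02**2,  0.10*1.02,  0.10]
            # The winner's target decays toward its earlier moves; the
            # loser's penalty is mildest on its earliest moves.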

            AllReplay.append( OneRePlay )
            AllReward.append( OneReward )
            AllRewardH.append( OneRewardH )
            initState()  # start a new game
            # print(getTrainData())

        if (len( AllReplay ) > 0):# and done):  # update gradients
            x_feed = np.vstack( AllReplay )
            x_feed = np.array( x_feed )
            x_feed = np.reshape( x_feed, [-1, 100] )
            r = np.hstack( AllReward )
            r = np.array( r )
            rh = np.hstack( AllRewardH )
            rh = np.array( rh )
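            # Both optimisers are fed the same board history x_feed, but each
            # regresses toward its own player's reward targets (r vs rh).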
            _, tlos1, ww1, ww2 = sess.run( [train, los, w1, w2], feed_dict={x: x_feed, y: np.reshape( r, [-1, 1] )} )
            _, tlos2 = sess.run( [train_h, los_h], feed_dict={x: x_feed, y: np.reshape( rh, [-1, 1] )} )
            if (step % 10 == 0):
                print( "los1,los2:", tlos1, tlos2 )
            # print("w1,w2:",ww1,ww2)

            if (done):  # episode finished
                OneReward = []
                OneRewardH = []
                OneRePlay = []

        OneRePlay.append( getTrainData() )
        OneReward.append( 1 )
        OneRewardH.append( 1 )

        who = 1 - who  # alternate players
        epCount += 1

        if (len( AllReplay ) > 50):
            AllReplay.pop( 0 )  # bug fix: drop the oldest episode; pop() dropped the newest
            AllReward.pop( 0 )
            AllRewardH.pop( 0 )
        # if (epCount % 10 == 0):  # peek at what's happening
        #     print( getTrainData() )
        #     outputState()
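
Note that this code targets the TensorFlow 1.x API (tf.placeholder, tf.Session, tf.contrib). If you want to try it on TensorFlow 2.x, a rough shim (a sketch, untested against this exact script) is:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# tf.contrib is gone in TF2: replace tf.contrib.layers.xavier_initializer()
# with tf.glorot_uniform_initializer(), the same Glorot/Xavier scheme.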

The training process looked like this:

[Training screenshot]