65 changes: 61 additions & 4 deletions RandomBird/FlappyAgent.py
@@ -1,9 +1,66 @@
from ple.games.flappybird import FlappyBird
from ple import PLE

from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Flatten

import numpy as np

list_actions = [None, 119]  # index 0: do nothing, index 1: flap (PLE key code 119)
nb_save = 4                 # delay, in frames, used by NemoPolicy
save_pipe_center = []       # FIFO buffer of recent pipe-gap centres

## Neural network initialization:
batchSize = 256 # mini-batch size
## Network layers: from experience, adding more layers did not make convergence any faster.
dqn = Sequential()
# 1st layer
#dqn.add(Dense(units=112, init='lecun_uniform', activation="relu", input_shape=(8,)))
# 2nd layer
dqn.add(Dense(units=500, init='lecun_uniform', activation="relu", input_shape=(8,)))
# 3rd layer
#dqn.add(Dense(units=112, init='lecun_uniform', activation="relu", input_shape=(8,)))
# output layer
dqn.add(Dense(units=2, init='lecun_uniform', activation="linear"))
dqn.compile(loss="mean_squared_error", optimizer=optimizers.Adam(1e-4))

dqn.load_weights("test_final.dqf") # Permet de charger le résultat précédent.

def FlappyPolicy(state, screen):
    # Baseline policy: flap with probability 1/2 (purely random).
    # Switch to NemoPolicy(state) to follow the delayed pipe-gap centre instead.
    action = None
    if np.random.randint(0, 2) < 1:
        action = 119
    return action


def DoriPolicy(state):
    # Flap whenever the bird is below the centre of the next pipe gap.
next_pip_center = (state['next_pipe_bottom_y']+state['next_pipe_top_y'])/2
if state['player_y'] > next_pip_center:
return list_actions[1]
else:
return list_actions[0]

def NemoPolicy(state):
    # Like DoriPolicy, but reacts to the pipe-gap centre seen nb_save frames earlier (simple FIFO delay).
global save_pipe_center, nb_save

next_pipe_center = (state['next_pipe_bottom_y']+state['next_pipe_top_y'])/2

if len(save_pipe_center)==0:
save_pipe_center = [next_pipe_center for i in range(nb_save+1)]
else:
save_pipe_center.append(next_pipe_center)

if state['player_y'] > save_pipe_center.pop(0):
return list_actions[1]
else:
return list_actions[0]


def greedy_action(network, state, batchSize): # Return the index of the action with the highest predicted Q-value.
    qval = network.predict(state.reshape(1, len(state)), batch_size=batchSize)
    return np.argmax(qval[0])

def Reinforcement_learning_policy(state):
    # Greedy policy using the loaded network; expects the raw game-state dict.
    state = np.array(list(state.values()))  # same conversion as process_state() in Training.py
    return list_actions[greedy_action(dqn, state, batchSize)]
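For reference, a minimal driver sketch (not part of this diff) showing how FlappyPolicy is typically called; the PLE setup mirrors the one in Training.py, and passing p.getScreenRGB() as the screen argument is an assumption since the run script is not shown (the current policy ignores screen anyway).

from ple.games.flappybird import FlappyBird
from ple import PLE
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
p.init()

p.reset_game()
while not p.game_over():
    state = game.getGameState()           # dict of 8 game features
    screen = p.getScreenRGB()             # raw pixels, unused by the current policy
    action = FlappyPolicy(state, screen)  # None (do nothing) or 119 (flap)
    p.act(action)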
157 changes: 157 additions & 0 deletions RandomBird/Training.py
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 2 15:24:01 2018

Training script for the Flappy Bird agent

@author: Gaspard Berthelin
"""

from ple.games.flappybird import FlappyBird
from ple import PLE

from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Flatten

import numpy as np

## Game initialization:
game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
p.init()
list_actions=[None,119]

## Neural network initialization:
batchSize = 256 # mini-batch size
## Network layers: from experience, adding more layers did not make convergence any faster.
dqn = Sequential()
# 1st layer
#dqn.add(Dense(units=112, init='lecun_uniform', activation="relu", input_shape=(8,)))
# 2nd layer
dqn.add(Dense(units=500, init='lecun_uniform', activation="relu", input_shape=(8,)))
# 3rd layer
#dqn.add(Dense(units=112, init='lecun_uniform', activation="relu", input_shape=(8,)))
# output layer
dqn.add(Dense(units=2, init='lecun_uniform', activation="linear"))
dqn.compile(loss="mean_squared_error", optimizer=optimizers.Adam(1e-4))

dqn.load_weights("final.dqf") # Permet de charger le résultat précédent.

def process_state(state): # Convert the game-state dict into a numpy feature vector for the network input.
return np.array(list(state.values()))

def epsilon(step, total): # Probability of taking a random (exploration) action at the given training step.
p_max = 0.4
p_min = 0.2
d = 0.0
x = step/total
if x < d:
return p_max
elif x > (1-d):
return p_min
return (p_min-p_max) * (x-d)/(1-2*d) + p_max
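# With d = 0.0 the two boundary branches never trigger, so epsilon decays linearly
# from p_max to p_min over the run; e.g. for total = 15000 games (the value used below):
#   epsilon(0, 15000)     = 0.4
#   epsilon(7500, 15000)  = 0.3
#   epsilon(15000, 15000) = 0.2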

def clip_reward(r): # Evaluation reward: 1 for each pipe passed, 0 otherwise (so the per-game sum counts pipes).
    rr = 0
    if r > 0:
        rr = 1
    if r < 0:
        rr = 0
    return rr

def training_reward(r): # Training reward: +1 per pipe passed, large penalty (-1000) on game over.
rr=0
if r>0:
rr=1
if r<0:
rr=-1000
return rr

def random_action(state): # To speed up convergence I tried a more sensible random action (flap when below the pipe centre); it was not conclusive in the long run.
if state[0] > (state[3]+state[4])/2.0 :
return 1
else :
return 0

def greedy_action(network, state, batchSize): # Return the index of the action with the highest predicted Q-value.
    qval = network.predict(state.reshape(1, len(state)), batch_size=batchSize)
    return np.argmax(qval[0])

def MCeval(network, trials): # Evaluate the current network: play a number of greedy games and return the total number of pipes passed.
scores = np.zeros((trials))
for i in range(trials):
p.reset_game()
while not(p.game_over()):
state = game.getGameState()
state = process_state(state)
action = greedy_action(network, state, batchSize)
action = list_actions[action]
reward = p.act(action)
reward = clip_reward(reward)
state = game.getGameState()
state = process_state(state)
scores[i] = scores[i] + reward
return np.sum(scores)


## Training loop:
total_games = 15000 # number of games played for training.
evaluation_period = 1000 # every evaluation_period games, evaluate the network and save a checkpoint.
gamma = 0.99 # discount factor for the Q-value update.
step_game = 0 # index of the current game.
while (step_game < total_games):
    p.reset_game() # reset the game
state = game.getGameState()
state = process_state(state)
rand_sum = 0
greedy_sum = 0
    tuyau_passe = 0 # number of pipes passed in this game
while(not game.game_over()):

if (np.random.random() < epsilon(step_game,total_games)):
#Exploration
rand_sum = rand_sum + 1
#action = random_action(state)
action = np.random.choice([0,1])
else:
            # Follow the neural network's prediction.
greedy_sum = greedy_sum + 1
action = greedy_action(dqn, state, batchSize)

        # Result of the action:
reward = p.act(list_actions[action])
reward = training_reward(reward)
if reward > 0:
tuyau_passe = tuyau_passe + 1

new_state = game.getGameState()
new_state = process_state(new_state)

terminal = game.game_over()
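        # One-step Q-learning target for the action that was taken:
        #   target = reward + gamma * max_a' Q(new_state, a'),
        # with the gamma * maxQ term dropped (factor 1 - terminal) when the game has ended.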

        ## Neural network update:
        newQ = dqn.predict(new_state.reshape(1, len(state)), batch_size=batchSize)
        maxQ = np.max(newQ)
        # Start from the current predictions for state and overwrite only the taken action,
        # so the other output keeps its own prediction as its target.
        y = dqn.predict(state.reshape(1, len(state)), batch_size=batchSize)

        update = reward + gamma * (1 - terminal) * maxQ

        y[0][action] = update

        dqn.fit(state.reshape(1, len(state)), y, batch_size=batchSize, nb_epoch=3, verbose=0)
state = new_state

print("game %d : score = %d / prop_alea = %s pourcents" % (step_game ,tuyau_passe,round(rand_sum/(rand_sum+greedy_sum)*100)))
    if step_game % evaluation_period == 0:
        mcval = MCeval(dqn, 50)
        print('eval_MC =', mcval)
        dqn.save("train" + str(step_game) + ".dqf")

step_game = step_game + 1
dqn.save("final.dqf")
Binary file added RandomBird/test_final.dqf
Binary file added RandomBird/train0.dqf
Binary file added RandomBird/train1000.dqf
Binary file added RandomBird/train10000.dqf
Binary file added RandomBird/train11000.dqf
Binary file added RandomBird/train12000.dqf
Binary file added RandomBird/train13000.dqf
Binary file added RandomBird/train2000.dqf
Binary file added RandomBird/train3000.dqf
Binary file added RandomBird/train4000.dqf
Binary file added RandomBird/train5000.dqf
Binary file added RandomBird/train6000.dqf
Binary file added RandomBird/train7000.dqf
Binary file added RandomBird/train8000.dqf
Binary file added RandomBird/train9000.dqf