diff --git a/HernanzGonzalez/FlappyAgent.py b/HernanzGonzalez/FlappyAgent.py
new file mode 100644
index 0000000..7f3e4f9
--- /dev/null
+++ b/HernanzGonzalez/FlappyAgent.py
@@ -0,0 +1,37 @@
+#### Flappy Bird policy selection function
+
+#%%
+# Imports
+import numpy as np
+import random
+
+# We load the Q-values dictionary (already learned)
+Q_learned = np.load("Q.npy").item()
+
+#%%
+# Round function to define the grid (nearest multiple of 5)
+def myround(x):
+    return int(5*round(float(x)/5))
+
+# Key creation function: builds the tuple of key variables
+def getKey(pos, distance, vel):
+    key = (myround(pos), myround(distance), vel)
+    return key
+
+#%%
+# Function that returns the action chosen by the learned policy.
+def FlappyPolicy(state, screen):
+
+    # Current state's key
+    pos = state["player_y"] - state["next_pipe_bottom_y"]
+    distance = state["next_pipe_dist_to_player"]
+    vel = state["player_vel"]
+    key = getKey(pos, distance, vel)
+
+    if(Q_learned.get(key) == None):
+        action = 119*random.randint(0, 1)  # unseen state: act randomly
+    else:
+        action = 119*np.argmax(Q_learned[key])  # 0 = do nothing, 119 = flap
+
+    # We return the selected action
+    return action
\ No newline at end of file
diff --git a/HernanzGonzalez/FlappyAgent2.py b/HernanzGonzalez/FlappyAgent2.py
new file mode 100644
index 0000000..f56caf8
--- /dev/null
+++ b/HernanzGonzalez/FlappyAgent2.py
@@ -0,0 +1,37 @@
+#### Flappy Bird policy selection function
+#### Convolutional neural network
+
+#%%
+# Imports
+import numpy as np
+from keras.models import load_model
+
+import skimage as skimage
+from skimage import color, transform, exposure
+
+ACTIONS = [0, 119]  # valid actions
+
+model = load_model("model.h5")
+np.seterr(divide='ignore', invalid='ignore')
+
+#%%
+# Function that returns the action chosen by the trained network.
+def FlappyPolicy(state, screen):
+
+    x_t = skimage.color.rgb2gray(screen)
+    x_t = skimage.transform.resize(x_t, (80, 80))
+    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
+
+    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
+
+    # In Keras, need to reshape
+    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*80*80*4
+
+    q = model.predict(s_t)  # input a stack of 4 images, get the prediction
+    max_Q = np.argmax(q)
+    action_index = max_Q
+    a_t = ACTIONS[action_index]
+
+    # We return the selected action
+    return a_t
\ No newline at end of file
diff --git a/HernanzGonzalez/Q.npy b/HernanzGonzalez/Q.npy
new file mode 100644
index 0000000..e05dc20
Binary files /dev/null and b/HernanzGonzalez/Q.npy differ
diff --git a/HernanzGonzalez/README.md b/HernanzGonzalez/README.md
new file mode 100644
index 0000000..ca3943f
--- /dev/null
+++ b/HernanzGonzalez/README.md
@@ -0,0 +1,23 @@
+## **Q-Learning algorithm to play Flappy Bird**
+
+In this folder you will find my version of a learning algorithm that learns to play the [Flappy Bird](https://en.wikipedia.org/wiki/Flappy_Bird) game from [PLE (PyGame Learning Environment)](https://github.com/ntasfi/PyGame-Learning-Environment).
+
+The game is played by executing the [run.py](run.py) file, which calls the `FlappyPolicy(state, screen)` function inside the [FlappyAgent.py](FlappyAgent.py) file. This function returns the action chosen by the learned policy for any given state.
+
+The action is selected by referring to a dictionary (saved inside the Q.npy file). For each discretised state, this dictionary stores the estimated values of the two possible actions of the bird (either do nothing or flap). The values are learned with a Q-learning algorithm implemented in the [q-learning.py](q-learning.py) file.
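+
+As a minimal sketch of the idea (the state keys below are made up; the real keys are rounded `(position, distance, velocity)` tuples as built in [q-learning.py](q-learning.py)), each dictionary entry is updated with the standard tabular Q-learning rule:
+
+```python
+ALPHA, GAMMA = 0.01, 0.9                  # learning rate and discount factor used in q-learning.py
+Q = {}                                    # state key -> [value of "do nothing", value of "flap"]
+key, next_key = (0, 140, 8), (0, 135, 8)  # illustrative keys only
+a, reward = 1, 1.0                        # index of the action taken and reward received
+
+for k in (key, next_key):
+    Q.setdefault(k, [0.0, 0.0])           # unseen states start with zero values
+
+# Move Q(s, a) towards the reward plus the discounted best value of the next state
+Q[key][a] = (1 - ALPHA) * Q[key][a] + ALPHA * (reward + GAMMA * max(Q[next_key]))
+print(Q[key])                             # [0.0, 0.01]
+```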
+
+The **bibliography** used to learn about this algorithm and implement it is:
+
+- https://en.wikipedia.org/wiki/Q-learning
+- http://mnemstudio.org/path-finding-q-learning-tutorial.htm (a simple example of Q-learning to exit a house from any of its five rooms)
+- https://studywolf.wordpress.com/2012/11/25/reinforcement-learning-q-learning-and-exploration/ (another example, with a mouse searching for cheese on a simple grid)
+
+I also want to point out several examples that were very useful for understanding how the algorithm works:
+
+- https://github.com/chncyhn/flappybird-qlearning-bot
+- http://sarvagyavaish.github.io/FlappyBirdRL/
+
+I have also been trying to implement a **convolutional neural network** that learns directly from the pixels of the screen, but I have not been successful. My attempts are visible in [nn-learning.py](nn-learning.py). I also created a [FlappyAgent2.py](FlappyAgent2.py) file that chooses the action using the network trained in [nn-learning.py](nn-learning.py), but the training did not converge. Still, there are many examples and tutorials available online; the two I have been using (both of which I think are great) are:
+
+- https://github.com/yenchenlin/DeepLearningFlappyBird
+- https://yanpanlau.github.io/2016/07/10/FlappyBird-Keras.html
diff --git a/HernanzGonzalez/model.h5 b/HernanzGonzalez/model.h5
new file mode 100644
index 0000000..1032ea9
Binary files /dev/null and b/HernanzGonzalez/model.h5 differ
diff --git a/HernanzGonzalez/nn-learning.py b/HernanzGonzalez/nn-learning.py
new file mode 100644
index 0000000..9c77343
--- /dev/null
+++ b/HernanzGonzalez/nn-learning.py
@@ -0,0 +1,207 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 6 15:02:59 2018
+
+@author: Julio Hernanz González
+"""
+
+#### Creation of a convolutional neural network model for the Flappy Bird PLE game.
+#### This is the training file. The bird plays the game for NB_EPISODES episodes
+#### and saves the experience, updating the created model.
+
+### I HAVE NOT ACHIEVED A SUCCESSFUL NEURAL NETWORK. IT DOES NOT CONVERGE.
+
+#%% IMPORTS
+
+# We import the game
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+
+import random
+import numpy as np
+
+from collections import deque
+
+# To work with the screen image
+import skimage as skimage
+from skimage import color, transform, exposure
+
+# Keras allows us to create the neural network
+from keras import initializers
+from keras.initializers import normal, identity
+from keras.models import Sequential, load_model
+from keras.layers.core import Dense, Activation, Flatten
+from keras.layers.convolutional import Conv2D
+from keras.optimizers import Adam
+import tensorflow as tf
+
+#%% NEURAL NETWORK
+
+# This function creates the model. The network is made of several layers. The
+# Conv2D layers are convolutional layers that slide a filter over the input to
+# produce an activation map (feature map). We use Rectified Linear Unit (ReLU)
+# activations; downsampling is done with strided convolutions rather than pooling layers.
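+# Assuming 80x80 greyscale inputs with 4 stacked frames (img_rows, img_cols and
+# img_channels below), the resulting architecture is roughly:
+#   Conv 8x8x32 stride 4 -> Conv 4x4x64 stride 2 -> Conv 3x3x64 stride 1
+#   -> Flatten -> Dense 512 -> Dense 2,
+# where the two outputs are the estimated Q-values of "do nothing" and "flap".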
+
+def buildmodel():
+    print("Now we build the model")
+    model = Sequential()
+    model.add(Conv2D(32, (8, 8), strides=(4, 4), padding='same', input_shape=(img_rows, img_cols, img_channels)))  # 80*80*4
+    model.add(Activation('relu'))
+    model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='same'))
+    model.add(Activation('relu'))
+    model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='same'))
+    model.add(Activation('relu'))
+    model.add(Flatten())
+    model.add(Dense(512))
+    model.add(Activation('relu'))
+    model.add(Dense(2))
+    adam = Adam(lr=LEARNING_RATE)
+    model.compile(loss='mse', optimizer=adam)
+    print("We finish building the model")
+    return model
+
+
+#%% CODE CORE
+
+
+# PARAMETERS
+ACTIONS = [0, 119]       # valid actions (don't flap, flap)
+GAMMA = 0.99             # discount factor
+LEARNING_RATE = 0.0001   # alpha
+OBSERVATION = 3200.      # timesteps to observe before training
+INITIAL_EPSILON = 0.1    # starting value of epsilon
+REPLAY_MEMORY = 50000    # number of previous transitions to remember
+BATCH = 32               # size of minibatch
+NB_EPISODES = 10000      # number of episodes
+
+img_rows, img_cols = 80, 80
+img_channels = 4  # We stack 4 frames
+
+# We start the backend
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+sess = tf.Session(config=config)
+from keras import backend as K
+K.set_session(sess)
+
+# We build the CNN model
+model = buildmodel()
+
+# We initialise the game
+game = FlappyBird(graphics="fixed")
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+p.init()
+episode_counter = 0
+counter = 0  # counter to control the reduction of epsilon
+
+# Replay memory: stores the previously observed transitions
+D = deque()
+
+# First action, don't flap.
+p.act(ACTIONS[0])
+x_t = p.getScreenRGB()
+terminal = p.game_over()
+
+# Preprocess the first frame: greyscale, resize to 80x80, rescale intensities
+x_t = skimage.color.rgb2gray(x_t)
+x_t = skimage.transform.resize(x_t, (80, 80))
+x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
+
+# The initial state is the first frame stacked four times
+s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
+
+# In Keras, need to reshape
+s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*80*80*4
+
+# We go to training mode
+epsilon = INITIAL_EPSILON
+t = 0
+
+loss = 0
+Q_sa = 0
+action_index = 0
+r_t = 0
+
+# We will be playing the game for a fixed number of episodes.
+while (episode_counter <= NB_EPISODES):
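+
+    # Each pass through this loop is one time step: pick an action (epsilon-greedy),
+    # apply it, stack the new frame onto the state, store the transition in the
+    # replay memory D and, once the observation phase is over, train the network
+    # on a random minibatch of stored transitions.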
+
+    # We reset the game if there has been a game over.
+    if p.game_over():
+        p.reset_game()
+
+    # Choose an action (epsilon-greedy)
+    if random.random() <= epsilon:
+        # Random action
+        action_index = random.randrange(2)
+        a_t = ACTIONS[action_index]
+    else:
+        q = model.predict(s_t)  # s_t is a stack of 4 images, we get the prediction
+        max_Q = np.argmax(q)
+        action_index = max_Q
+        a_t = ACTIONS[action_index]
+    counter += 1
+
+    # Epsilon reduction (only once the observation phase is done)
+    if counter % 500 == 0 and t > OBSERVATION:
+        epsilon *= 0.9
+
+    # Run the action and get the next state and the reward
+    r_t = p.act(a_t)
+    terminal = p.game_over()
+    x_t1_colored = p.getScreenRGB()
+
+    x_t1 = skimage.color.rgb2gray(x_t1_colored)
+    x_t1 = skimage.transform.resize(x_t1, (80, 80))
+    x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
+
+    x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
+    s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)  # new frame plus the 3 most recent ones
+
+    # We store the transition in D
+    D.append((s_t, action_index, r_t, s_t1, terminal))
+    if len(D) > REPLAY_MEMORY:
+        D.popleft()
+
+    # Training once t is bigger than OBSERVATION (we have already observed enough)
+    if t > OBSERVATION:
+
+        # Pick the minibatch for the training (size is BATCH)
+        minibatch = random.sample(D, BATCH)
+
+        inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # 32, 80, 80, 4
+        targets = np.zeros((inputs.shape[0], len(ACTIONS)))  # 32, 2
+
+        # Experience replay
+        for i in range(0, len(minibatch)):
+
+            state_t = minibatch[i][0]
+            action_t = minibatch[i][1]  # Action index
+            reward_t = minibatch[i][2]
+            state_t1 = minibatch[i][3]
+            terminal_t = minibatch[i][4]  # renamed so it does not shadow the outer flag
+
+            inputs[i:i + 1] = state_t  # We save s_t
+
+            targets[i] = model.predict(state_t)  # Prediction
+            Q_sa = model.predict(state_t1)
+
+            if terminal_t:
+                targets[i, action_t] = reward_t  # if terminated, the target is only the reward
+            else:
+                targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)
+
+        loss += model.train_on_batch(inputs, targets)
+
+    # End of the episode (the counter only increases when the game is over)
+    if terminal:
+        episode_counter += 1
+
+        # Control print
+        if episode_counter % 100 == 0:
+            print("Episode number:", episode_counter)
+
+    # New state and time step + 1
+    s_t = s_t1
+    t = t + 1
+
+    # We save the progress every 1000 iterations
+    if t % 1000 == 0:
+        print("Now we save the model")  # Control print
+        model.save("model.h5", overwrite=True)
+
\ No newline at end of file
diff --git a/HernanzGonzalez/q-learning.py b/HernanzGonzalez/q-learning.py
new file mode 100644
index 0000000..2f5ef6d
--- /dev/null
+++ b/HernanzGonzalez/q-learning.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Feb 11 14:10:25 2018
+
+@author: Julio Hernanz González
+"""
+
+#### Creation of the Q-value dictionary for the Flappy Bird PLE game.
+#### This is the training file. The bird plays the game for NB_GAMES games and
+#### saves the information, learning with the Q-learning algorithm.
+
+#%% Imports
+
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+import random
+
+#%% Functions
+
+# Function that rounds to the nearest multiple of 5, creating a grid for the
+# x and y position values.
+def myround(x):
+    return int(5 * round(float(x)/5))
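+# For example, myround(-23.7) returns -25 and myround(12.2) returns 10, so nearby
+# positions and distances fall into the same 5-pixel grid cell and share a Q-value entry.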
+
+# Creation of a tuple with the three variables that are used as the key of the dictionary.
+def getKey(pos, distance, vel):
+    key = (myround(pos), myround(distance), vel)
+    return key
+
+#%% Q-learning code
+
+# Beginning the game:
+game = FlappyBird(graphics="fixed")
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
+p.init()  # Initialize the game
+reward = 0.0
+
+NB_GAMES = 1000  # Number of games to be played
+cumulated = np.zeros((NB_GAMES))
+counter = 0
+games_counter = 0
+
+# We initialize or load the Q dictionary
+#Q = dict()  # to restart the dictionary
+Q = np.load("Q.npy").item()  # to keep learning on an existing dictionary
+
+# Q-learning parameters
+GAMMA = 0.9
+ALPHA = 0.01
+EPSILON = 0.01
+ACTIONS = [0, 119]  # allowed actions for the game (do nothing / flap)
+
+for i in range(NB_GAMES):
+    p.reset_game()
+    games_counter += 1
+
+    if(games_counter % 500 == 0):
+        print(games_counter)  # to keep track of how many games have been played
+
+    while(not p.game_over()):
+        state = game.getGameState()  # Current state.
+
+        # Key of the current state
+        pos = state["player_y"] - state["next_pipe_bottom_y"]  # y position difference between the bird and the bottom pipe
+        distance = state["next_pipe_dist_to_player"]  # x distance to the pipes
+        vel = state["player_vel"]  # bird speed
+        key = getKey(pos, distance, vel)
+
+        # In case the key does not exist yet, we create an entry in the dictionary
+        if(Q.get(key) == None):
+            Q[key] = [0, 0]
+
+        counter += 1
+
+        if(counter % 500 == 0):  # to reduce the value of epsilon
+            EPSILON *= 0.9
+
+        # Epsilon-greedy selection: explore a random action with probability EPSILON,
+        # otherwise exploit the action with the highest learned Q-value.
+        if(EPSILON < random.random()):
+            a = np.argmax(Q[key])
+            action = ACTIONS[a]
+        else:
+            a = random.randint(0, 1)
+            action = ACTIONS[a]
+
+        # We execute the action and save the reward value.
+        reward = p.act(action)
+        cumulated[i] = cumulated[i] + reward
+
+        # Next state and its key
+        state = game.getGameState()
+        posprima = state["player_y"] - state["next_pipe_bottom_y"]
+        distanceprima = state["next_pipe_dist_to_player"]
+        velprima = state["player_vel"]
+        keyprima = getKey(posprima, distanceprima, velprima)
+
+        # In case the key does not exist yet, we create an entry in the dictionary
+        if(Q.get(keyprima) == None):
+            Q[keyprima] = [0, 0]
+
+        # We select the best possible Q-value of the next state
+        maxQsprima = max(Q[keyprima])
+
+        # We update the Q value
+        Q[key][a] = (1-ALPHA)*Q[key][a] + ALPHA*(reward + GAMMA*maxQsprima)
+
+    # Last update of the episode, using the state reached when p.game_over()
+    state = game.getGameState()
+    posprima = state["player_y"] - state["next_pipe_bottom_y"]
+    distanceprima = state["next_pipe_dist_to_player"]
+    velprima = state["player_vel"]
+    keyprima = getKey(posprima, distanceprima, velprima)
+
+    if(Q.get(keyprima) == None):
+        Q[keyprima] = [0, 0]
+
+    maxQsprima = max(Q[keyprima])
+    Q[key][a] = Q[key][a] + ALPHA*(reward + GAMMA*maxQsprima - Q[key][a])
+
+max_score = np.max(cumulated)
+print(max_score)
+
+# We save the Q dictionary
+np.save("Q.npy", Q)
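+
+# Note: the saved dictionary maps (rounded position difference, rounded distance,
+# velocity) tuples to [Q(do nothing), Q(flap)] lists. FlappyAgent.py reloads it
+# with np.load("Q.npy").item() and then plays greedily with respect to these values.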
\ No newline at end of file
diff --git a/RandomBird/run.py b/HernanzGonzalez/run.py
similarity index 82%
rename from RandomBird/run.py
rename to HernanzGonzalez/run.py
index 39b5801..a13c60a 100644
--- a/RandomBird/run.py
+++ b/HernanzGonzalez/run.py
@@ -4,7 +4,7 @@
 import numpy as np
 
 from FlappyAgent import FlappyPolicy
 
-game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+game = FlappyBird(graphics="fancy") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
 p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
 # Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
@@ -20,10 +20,10 @@
     while(not p.game_over()):
         state = game.getGameState()
         screen = p.getScreenRGB()
-        action=FlappyPolicy(state, screen) ### Your job is to define this function.
+        action = FlappyPolicy(state, screen) ### Your job is to define this function.
 
         reward = p.act(action)
         cumulated[i] = cumulated[i] + reward
 
 average_score = np.mean(cumulated)
-max_score = np.max(cumulated)
+max_score = np.max(cumulated)
\ No newline at end of file
diff --git a/RandomBird/FlappyAgent.py b/RandomBird/FlappyAgent.py
deleted file mode 100644
index 9f3ec84..0000000
--- a/RandomBird/FlappyAgent.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import numpy as np
-
-def FlappyPolicy(state, screen):
-    action=None
-    if(np.random.randint(0,2)<1):
-        action=119
-    return action
-
-