diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..f17892aec Binary files /dev/null and b/.DS_Store differ diff --git a/DQN/.DS_Store b/DQN/.DS_Store new file mode 100644 index 000000000..5a032cb7c Binary files /dev/null and b/DQN/.DS_Store differ diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d2a295cf1..a11798b00 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -4,7 +4,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -29,7 +31,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -40,7 +44,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -52,7 +58,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -67,7 +75,7 @@ " self.output = tf.image.rgb_to_grayscale(self.input_state)\n", " self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)\n", " self.output = tf.image.resize_images(\n", - " self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", + " self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " self.output = tf.squeeze(self.output)\n", "\n", " def process(self, sess, state):\n", @@ -86,7 +94,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -107,7 +117,7 @@ " summary_dir = os.path.join(summaries_dir, \"summaries_{}\".format(scope))\n", " if not os.path.exists(summary_dir):\n", " os.makedirs(summary_dir)\n", - " self.summary_writer = tf.train.SummaryWriter(summary_dir)\n", + " self.summary_writer = tf.summary.FileWriter(summary_dir)\n", "\n", " def _build_model(self):\n", " \"\"\"\n", @@ -151,11 +161,11 @@ " self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())\n", "\n", " # Summaries for Tensorboard\n", - " self.summaries = tf.merge_summary([\n", - " tf.scalar_summary(\"loss\", self.loss),\n", - " tf.histogram_summary(\"loss_hist\", self.losses),\n", - " tf.histogram_summary(\"q_values_hist\", self.predictions),\n", - " tf.scalar_summary(\"max_q_value\", tf.reduce_max(self.predictions))\n", + " self.summaries = tf.summary.merge([\n", + " tf.summary.scalar(\"loss\", self.loss),\n", + " tf.summary.histogram(\"loss_hist\", self.losses),\n", + " tf.summary.histogram(\"q_values_hist\", self.predictions),\n", + " tf.summary.scalar(\"max_q_value\", tf.reduce_max(self.predictions))\n", " ])\n", "\n", "\n", @@ -199,7 +209,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -212,7 +224,7 @@ "sp = StateProcessor()\n", "\n", "with tf.Session() as sess:\n", - " sess.run(tf.initialize_all_variables())\n", + " sess.run(tf.global_variables_initializer())\n", " \n", " # Example observation batch\n", " observation = env.reset()\n", @@ -234,7 +246,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": 
true, + "editable": true }, "outputs": [], "source": [ @@ -264,7 +278,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -294,7 +310,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -468,7 +486,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -526,7 +546,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 22bb9ebc9..fd13fcf4c 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -4,7 +4,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -29,7 +31,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -40,7 +44,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -52,7 +58,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -67,7 +75,7 @@ " self.output = tf.image.rgb_to_grayscale(self.input_state)\n", " self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)\n", " self.output = tf.image.resize_images(\n", - " self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", + " self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " self.output = tf.squeeze(self.output)\n", "\n", " def process(self, sess, state):\n", @@ -86,7 +94,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -131,7 +141,7 @@ " self.train_op = tf.no_op(\"train_pp\")\n", " \n", " # Summaries for Tensorboard\n", - " self.summaries = tf.merge_summary([\n", - " tf.scalar_summary(\"loss\", self.loss)\n", + " self.summaries = tf.summary.merge([\n", + " tf.summary.scalar(\"loss\", self.loss)\n", " ])\n", "\n", @@ -176,7 +186,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -211,7 +223,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -241,7 +255,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -271,7 +287,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -473,7 +491,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": 
true }, "outputs": [], "source": [ @@ -531,7 +551,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.1" } }, "nbformat": 4, diff --git a/DQN/dqn.py b/DQN/dqn.py index 7b459240d..92a876f3f 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -5,14 +5,39 @@ import random import sys import tensorflow as tf +import cv2 if "../" not in sys.path: sys.path.append("../") from lib import plotting from collections import deque, namedtuple +from gym.wrappers import Monitor -env = gym.envs.make("Breakout-v0") +# added arguments +if len(sys.argv) < 3: + print("Usage: python dqn.py [-train|-run] <game number 1-4>") + exit(1) + +trainBool = False +game_dict = {1: "Breakout-v0", 2: "Pong-v0", 3: "SpaceInvaders-v0", 4: "MsPacman-v0"} + +string = sys.argv[1] + +if string == '-train': + trainBool = True +elif string == '-run': + pass +else: + print("Usage: python dqn.py [-train|-run] <game number 1-4>") + exit(1) + +game_num = int(sys.argv[2]) +if game_num > 4 or game_num < 1: + print("Usage: python dqn.py [-train|-run] <game number 1-4>") + exit(1) + +env = gym.envs.make(game_dict[game_num]) # Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions VALID_ACTIONS = [0, 1, 2, 3] @@ -28,7 +53,7 @@ def __init__(self): self.output = tf.image.rgb_to_grayscale(self.input_state) self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) self.output = tf.image.resize_images( - self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) self.output = tf.squeeze(self.output) def process(self, sess, state): @@ -59,7 +84,7 @@ def __init__(self, scope="estimator", summaries_dir=None): summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) if not os.path.exists(summary_dir): os.makedirs(summary_dir) - self.summary_writer = tf.train.SummaryWriter(summary_dir) + self.summary_writer = tf.summary.FileWriter(summary_dir) def _build_model(self): """ @@ -103,11 +128,11 @@ def _build_model(self): self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) # Summaries for Tensorboard - self.summaries = tf.merge_summary([ - tf.scalar_summary("loss", self.loss), - tf.histogram_summary("loss_hist", self.losses), - tf.histogram_summary("q_values_hist", self.predictions), - tf.scalar_summary("max_q_value", tf.reduce_max(self.predictions)) + self.summaries = tf.summary.merge([ + tf.summary.scalar("loss", self.loss), + tf.summary.histogram("loss_hist", self.losses), + tf.summary.histogram("q_values_hist", self.predictions), + tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) ]) @@ -205,7 +230,8 @@ def deep_q_learning(sess, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, - record_video_every=50): + record_video_every=100000, + render=False): """ Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. 
@@ -292,9 +318,12 @@ def deep_q_learning(sess, state = next_state # Record videos - env.monitor.start(monitor_path, - resume=True, - video_callable=lambda count: count % record_video_every == 0) + # env.monitor.start(monitor_path, + # resume=True, + # video_callable=lambda count: count % record_video_every == 0) + env = Monitor(directory=monitor_path, + resume=True, + video_callable=lambda count: count % record_video_every == 0, env=env) for i_episode in range(num_episodes): @@ -309,7 +338,8 @@ def deep_q_learning(sess, # One step in the environment for t in itertools.count(): - + if render: + env.render() # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] @@ -385,7 +415,7 @@ def deep_q_learning(sess, tf.reset_default_graph() # Where we save our checkpoints and graphs -experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id)) +experiment_dir = os.path.dirname("./experiments/{}/".format(env.spec.id)) # Create a global step variable global_step = tf.Variable(0, name='global_step', trainable=False) @@ -397,23 +427,47 @@ def deep_q_learning(sess, # State processor state_processor = StateProcessor() -with tf.Session() as sess: - sess.run(tf.initialize_all_variables()) - for t, stats in deep_q_learning(sess, - env, - q_estimator=q_estimator, - target_estimator=target_estimator, - state_processor=state_processor, - experiment_dir=experiment_dir, - num_episodes=10000, - replay_memory_size=500000, - replay_memory_init_size=50000, - update_target_estimator_every=10000, - epsilon_start=1.0, - epsilon_end=0.1, - epsilon_decay_steps=500000, - discount_factor=0.99, - batch_size=32): - - print("\nEpisode Reward: {}".format(stats.episode_rewards[-1])) - +# training step +if trainBool: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for t, stats in deep_q_learning(sess, + env, + q_estimator=q_estimator, + target_estimator=target_estimator, + state_processor=state_processor, + experiment_dir=experiment_dir, + num_episodes=10000, + replay_memory_size=500000, + replay_memory_init_size=50000, + update_target_estimator_every=10000, + epsilon_start=1.0, + epsilon_end=0.1, + epsilon_decay_steps=500000, + discount_factor=0.99, + batch_size=32): + + print("\nEpisode Reward: {}".format(stats.episode_rewards[-1])) +else: + # In order to run as specified by the paper, epsilon_start is changed to 0.05 + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for t, stats in deep_q_learning(sess, + env, + q_estimator=q_estimator, + target_estimator=target_estimator, + state_processor=state_processor, + experiment_dir=experiment_dir, + num_episodes=10000, + replay_memory_size=500000, + replay_memory_init_size=500, + update_target_estimator_every=10000, + epsilon_start=0.05, + epsilon_end=0.05, + epsilon_decay_steps=500000, + discount_factor=0.99, + batch_size=32, + record_video_every=100, + render=True): + + print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))
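
Usage note for the modified dqn.py, a minimal sketch of how the new command-line interface is meant to be invoked (assumptions: the script is run from the DQN/ directory and, for -run, a display is available for rendering; the game numbers follow the game_dict introduced in the patch):

    # Train from scratch on Breakout-v0 (game 1); checkpoints and TensorBoard
    # summaries are written under ./experiments/Breakout-v0/
    python dqn.py -train 1

    # Run a previously trained agent on MsPacman-v0 (game 4); epsilon is held
    # at 0.05, frames are rendered, and the Monitor wrapper records a video
    # every 100 episodes
    python dqn.py -run 4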