KDD Cup | Humanities Track Tutorial: Policy Gradients

Date Created: May 20, 2019

Vanilla Policy Gradient Agent

This tutorial contains a simple example of how to build a policy-gradient-based agent, built on top of this Medium post and adapted to use the Challenge Environment.

In [ ]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import os
from sys import exit, exc_info, argv
import random
import pandas as pd

!pip3 install git+https://github.com/slremy/netsapi --user --upgrade

from netsapi.challenge import *

# Python 2/3 compatibility: Python 3 removed xrange, so fall back to range.
try:
    xrange = xrange
except NameError:
    xrange = range
In [ ]:
env = ChallengeSeqDecEnvironment()  # instantiate the sequential decision-making Challenge Environment
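
Before building the agent, it can help to sanity-check the environment interface. The sketch below is illustrative only; it uses the same reset/state/evaluateAction calls that appear in the training loop later in this notebook, and the [0.5, 0.5] action is an arbitrary example. Note that each call to evaluateAction may count against the environment's evaluation budget, so you can skip this step and go straight to training.

In [ ]:
# Optional sanity check of the environment interface.
env.reset()
print('initial state:', env.state)

# An action is a pair [ITN coverage, IRS coverage], each in [0, 1].
next_state, reward, done, _ = env.evaluateAction([0.5, 0.5])
print('next state:', next_state, 'reward:', reward, 'done:', done)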

The Policy-Based Agent

In [ ]:
gamma = 0.99

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
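
To make the effect of discounting concrete, here is a small worked example of what discount_rewards returns for a constant reward of 1 at every step with gamma = 0.99:

In [ ]:
# Each entry is that step's reward plus the discounted sum of all later rewards.
print(discount_rewards(np.array([1.0, 1.0, 1.0])))  # [2.9701 1.99   1.    ]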
In [ ]:
class agent():
    def __init__(self, lr, s_size,a_size,h_size):
        #These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden,a_size,activation_fn=tf.nn.softmax,biases_initializer=None)
        self.chosen_action = tf.argmax(self.output,1)

        #The next few lines establish the training procedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))
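
The indexes/responsible_outputs lines above flatten the batch of softmax outputs and gather the probability the network assigned to each action that was actually taken; the loss is then the negative log of those probabilities weighted by the (discounted) rewards. Here is a minimal NumPy sketch of the same indexing, for illustration only:

In [ ]:
# Two states, three actions: each row is a softmax output from the network.
output = np.array([[0.2, 0.5, 0.3],
                   [0.7, 0.1, 0.2]])
chosen = np.array([1, 0])  # action index taken in each state

# Same arithmetic as the graph: row * num_actions + chosen action, indexing the flattened output.
indexes = np.arange(output.shape[0]) * output.shape[1] + chosen
responsible = output.reshape(-1)[indexes]
print(responsible)  # [0.5 0.7]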

Training the Agent

In [ ]:
def actionSpace(resolution):
    x,y = np.meshgrid(np.arange(0,1+resolution,resolution), np.arange(0,1+resolution,resolution))
    xy = np.concatenate((x.reshape(-1,1), y.reshape(-1,1)), axis=1)
    return xy.round(2).tolist()

action_resolution = 0.2

actions = actionSpace(action_resolution)
a_s = len(actions)
actionspace = range(a_s)  # indices of the discrete actions (not used below)


tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2,s_size=1,a_size=a_s,h_size=8) #Load the agent.

total_episodes = 20 #Set total number of episodes to train agent on.
max_ep = 5
update_frequency = 4

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_length = []
        
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while i < total_episodes:
        env.reset()
        s1 = env.state
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            s = s1
            s = np.expand_dims(s, axis=0)
            print('s=',s)
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
            a = np.random.choice(a_dist[0],p=a_dist[0])
            a = np.argmax(a_dist == a)
            env_action = actions[a] #convert to ITN/IRS
            print('env_action', env_action)
            s1,r,d,_  = env.evaluateAction(env_action)
            ep_history.append([s,a,r,s1])

            running_reward += r
            if d == True:
                #Update the network.
                ep_history = np.array(ep_history)
                ep_history[:,2] = discount_rewards(ep_history[:,2])
                feed_dict={myAgent.reward_holder:ep_history[:,2],
                        myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                
                total_reward.append(running_reward)
                total_length.append(j)
                break
        i += 1
                
    #Now extract final policy
    best_policy = {}
    for j in range(5):

        s = j+1
        s = np.expand_dims(s, axis=0)

        #Probabilistically pick an action given our network outputs.
        a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
        a = np.random.choice(a_dist[0],p=a_dist[0])
        a = np.argmax(a_dist == a)
        env_action = actions[a] #convert to ITN/IRS
        best_policy[int(s)]= env_action
        
    print('best_policy',best_policy)
    best_reward = env.evaluatePolicy(best_policy)
    print('best_reward',best_reward)
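
To see how an action index maps to an [ITN, IRS] pair (the conversion performed by env_action = actions[a] above), you can print the grid produced by actionSpace; a coarser resolution is used here just to keep the output short:

In [ ]:
# With resolution 0.5, the grid contains 3 x 3 = 9 candidate [ITN, IRS] actions.
for idx, act in enumerate(actionSpace(0.5)):
    print(idx, act)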

Comments


Paulina
Hello, could you please explain why the rewards are transformed by the discount_rewards function, and how the formula in that function was derived? Thanks
oetbent
Hi Paulina, this implementation uses rewards discounted by the factor gamma. You can think of discounting as giving greater weight to rewards at the start of an episode than to rewards further in the future. This reflects the notion that we tend to base decisions on what we can obtain now rather than at some point in the future. In the context of this problem, if I say that you can achieve a 10% reduction in malaria prevalence today or an 11% reduction next year, we'd tend to weight what we can achieve today more highly.