contextual_bandits_tensorflow.py
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim  # requires TensorFlow 1.x


class Contextual_bandits(object):
    """Bank of four 4-armed bandits; the current state selects which bandit is active."""

    def __init__(self):
        self.state = 0
        self.bandits = np.array([[1.0, 4.5, -2.5, -3.6],
                                 [0.6, 6.3, -1.6, -3.4],
                                 [-2.0, 4.5, -2.7, -1.9],
                                 [2.4, 0.1, -0.5, -0.9]])
        self.num_states = self.bandits.shape[0]
        self.len_bandits = self.bandits.shape[1]

    def get_bandit_state(self):
        # Pick the active bandit (state) uniformly at random.
        self.state = np.random.randint(0, self.num_states)
        return self.state

    def pull_a_bandit(self, chosen_action):
        # Reward is +1 when the arm's value beats a standard-normal draw, else -1,
        # so arms with higher values pay off more often.
        random = np.random.randn(1)
        bandit = self.bandits[self.state, chosen_action]
        if bandit > random:
            return 1   # positive reward
        else:
            return -1  # negative reward
class My_agent(object):
    """One-layer policy network: a one-hot state maps to a sigmoid weight per action."""

    def __init__(self, learning_rate, state_size, action_size):
        self.state_current = tf.placeholder(shape=[1], dtype=tf.int32)
        state_in_onehot = slim.one_hot_encoding(self.state_current, state_size)
        output = slim.fully_connected(state_in_onehot, action_size,
                                      biases_initializer=None,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_initializer=tf.ones_initializer())
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)  # reward returned by pull_a_bandit
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)    # index of the action taken
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])  # current value of the chosen action
        self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)  # policy-gradient loss
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        self.update = optimizer.minimize(self.loss)
tf.reset_default_graph()

c_bandit = Contextual_bandits()
my_agent = My_agent(learning_rate=0.001, state_size=c_bandit.num_states, action_size=c_bandit.len_bandits)
weights = tf.trainable_variables()[0]  # the network's weight matrix, the only variable updated by gradient descent

total_num_test = 100000
total_reward = np.zeros([c_bandit.num_states, c_bandit.len_bandits])
e = 0.2  # probability of taking a random (exploratory) action

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_num_test:
        current_state = c_bandit.get_bandit_state()
        # Epsilon-greedy selection: act greedily with probability 1 - e, otherwise
        # explore. A uniform draw is needed here; the original np.random.randn draw
        # would not explore with probability e.
        e_random = np.random.rand(1)
        if e_random > e:
            action = sess.run(my_agent.chosen_action,
                              feed_dict={my_agent.state_current: [current_state]})
        else:
            action = np.random.randint(c_bandit.len_bandits)
        reward = c_bandit.pull_a_bandit(action)
        feed_dict = {my_agent.reward_holder: [reward],
                     my_agent.action_holder: [action],
                     my_agent.state_current: [current_state]}
        _, curr_weight = sess.run([my_agent.update, weights], feed_dict=feed_dict)
        total_reward[current_state, action] += reward
        i += 1
        if i % 500 == 0:
            print("Mean reward for each of the " + str(c_bandit.num_states) + " bandits: " + str(np.mean(total_reward, axis=1)))
for x in range(c_bandit.num_states):
    # print("The agent thinks action " + str(np.argmax(curr_weight[x]) + 1) + " for bandit " + str(x + 1) + " is best")
    print(curr_weight[x])
    if np.argmax(curr_weight[x]) == np.argmax(c_bandit.bandits[x]):
        print("....you are right")
    else:
        print("...sorry you are wrong")