Dear Group,
Sorry for reposting, but I did a quick search and did not find a matching post here.
I am trying to model an agent using the OpenAI Gym environment, which gives you an environment class:
import gym
...
# Initialize Environment
env = gym.make("Pong-v0")
Outside Theano, you can use it to form an agent-environment loop via:
# Reset Environment to Initial State
env.reset()

# Action-Perception Loop
done = False
while not done:
    action = some_sample_code()
    observation, reward, done, info = env.step(action)
I am trying to do the same thing inside a Theano scan function with the following script:
# Imports
import cPickle
import gzip
import os
import sys
import timeit

import matplotlib.pyplot as plt
import gym
import numpy
import theano
import theano.tensor as T
from theano import pprint as pp
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Initialize RNG
ii32 = numpy.iinfo(numpy.int32)
theano_rng = RandomStreams(numpy.random.randint(ii32.max))  # ADD RANDOM SEED!

# Functions and Classes
def softmax(X):
    eX = T.exp(X - X.max(axis=-1, keepdims=True))
    prob = eX / eX.sum(axis=-1, keepdims=True)
    return prob

def Cat_sample(pi, num_sample=None):
    z = theano_rng.multinomial(n=1, pvals=pi, dtype=pi.dtype)
    return z

# Initialize Environment
env = gym.make("Pong-v0")

# Information about environment
print 'Action space:'
print(env.action_space)
print 'Number of actions:'
print(env.action_space.n)
print 'Observation space:'
print(env.observation_space)

# Define Random Policy in Theano
values = theano_rng.normal(size=(1, env.action_space.n), avg=0., std=10.,
                           dtype=theano.config.floatX)
probabilities = softmax(values)
decision = Cat_sample(probabilities)

# Run Agent within Theano scan
def inner_fn():
    # Sample random decision within Theano
    action = T.ge(decision, 0.5).nonzero()[1]
    # Here we are leaving Theano to interact with Gym
    observation, reward, done, info = env.step(action.eval())
    # Here we bring back the output of Gym to Theano
    observation = T.constant(observation, dtype=theano.config.floatX)
    reward = T.constant(reward, dtype=theano.config.floatX)
    if done:
        done = 1.0
    else:
        done = 0.0
    done = T.constant(done, dtype=theano.config.floatX)
    return reward

# Reset environment
env.reset()

# Define scan loop
(rewards, updates) = theano.scan(fn=inner_fn,
                                 n_steps=3000)

# Output function
sample_outputs = theano.function(
    inputs=[],
    outputs=[rewards],
    updates=[],
    givens={},
    on_unused_input='ignore'
)

a = sample_outputs()
print a[0]
print 'Min:'
print numpy.amin(a)
print 'Max:'
print numpy.amax(a)
However, it seems as if the state of the environment never gets updated. Is there a way to implement this so that the state of the environment does get updated, yet the individual action and observation sequences can still be used as Theano variables?
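One idea I had, but have not gotten to work yet, is to wrap the call to env.step in a custom Theano Op, so that the interaction with Gym happens inside perform() at execution time of every scan step instead of once while the graph is built. Here is a rough, untested sketch of what I mean (the Op name GymStepOp, the float32 output types, and returning only the reward are just my placeholders; env, decision, and the imports are the ones from the script above):

class GymStepOp(theano.Op):
    # Rough sketch: executes one env.step() per call at run time
    __props__ = ()

    def make_node(self, action):
        action = T.as_tensor_variable(action)
        # Outputs: reward and done flag, both scalars
        return theano.Apply(self, [action], [T.fscalar(), T.fscalar()])

    def perform(self, node, inputs, output_storage):
        (action,) = inputs
        observation, reward, done, info = env.step(int(action))
        output_storage[0][0] = numpy.asarray(reward, dtype='float32')
        output_storage[1][0] = numpy.asarray(1.0 if done else 0.0, dtype='float32')

gym_step = GymStepOp()

def inner_fn():
    # Sample random decision within Theano
    action = T.ge(decision, 0.5).nonzero()[1][0]
    # env.step now runs inside perform() at every scan iteration
    reward, done = gym_step(action)
    return reward

Would something like this be the recommended way, or is there a cleaner pattern for feeding Gym observations back into a scan loop?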
Thank you a lot,
Kai