Dear Group,

Sorry for reposting, but I did a quick search and did not find a matching post here:

I am trying to model an agent using OpenAI Gym, which gives you an environment object:

import gym

...

# Initialize Environment
env = gym.make("Pong-v0")

Outside Theano, you can use it to form an agent-environment loop via:

# Reset Environment to Initial State
env.reset()

# Action-Perception Loop
done = False
while not done:
    action = some_sample_code()
    observation, reward, done, info = env.step(action)

I am trying the same thing inside a Theano Scan function in the following 
script:

# Imports
import cPickle
import gzip
import os
import sys
import timeit

import matplotlib.pyplot as plt
import gym

import numpy

import theano
import theano.tensor as T

from theano import pprint as pp

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams  

# Initialize RNG
ii32 = numpy.iinfo(numpy.int32)
theano_rng = RandomStreams(numpy.random.randint(ii32.max)) # ADD RANDOM SEED!

# Functions and Classes
def softmax(X):
    eX = T.exp(X - X.max(axis=-1, keepdims = True))
    prob = eX / eX.sum(axis=-1, keepdims=True)
    return prob  
       
def Cat_sample(pi, num_sample=None):

    z = theano_rng.multinomial(n=1, pvals = pi, dtype=pi.dtype)
    
    return z
       
# Initialize Environment
env = gym.make("Pong-v0")

# Information about environment
print 'Action space:'
print(env.action_space)
print 'Number of actions:'
print(env.action_space.n)
print 'Observation space:'
print(env.observation_space)

# Define Random Policy in Theano
values = theano_rng.normal(size=(1, env.action_space.n), avg=0., std=10., dtype=theano.config.floatX)
probabilities = softmax(values)
decision = Cat_sample(probabilities)

# Run Agent within Theano scan
def inner_fn():    
    
    # Sample random decision within Theano
    action = T.ge(decision, 0.5).nonzero()[1]
    
    # Here we are leaving Theano to interact with Gym
    observation, reward, done, info = env.step(action.eval())
    
    # Here we bring back the output of Gym to Theano
    observation = T.constant(observation, dtype=theano.config.floatX)
    reward = T.constant(reward, dtype=theano.config.floatX)
    
    if done:
        done = 1.0
    else:
        done = 0.0
        
    done = T.constant(done, dtype=theano.config.floatX)
    
    return reward

# Reset environment    
env.reset()    

# Define scan loop
(rewards, updates) = theano.scan(fn=inner_fn,
                     n_steps=3000)
                     
# Output function
sample_outputs = theano.function(
        inputs=[],
        outputs=[rewards],
        updates=[],
        givens={},
        on_unused_input='ignore'
    )
    
a = sample_outputs()
print a[0]

print 'Min:'
print numpy.amin(a)
print 'Max:'
print numpy.amax(a)

However, it seems as if the state of the environment never gets updated. Is 
there a way to implement this so that the environment state is advanced on 
every step, while the individual action and observation sequences can still 
be used as Theano variables?
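
One idea I have been considering, but have not tested, is to wrap env.step 
in a Python Op via theano.compile.ops.as_op, so that the Gym call is 
executed at run time on every scan iteration instead of only once at 
graph-construction time (as happens with the .eval() call above). This is 
only a rough sketch, and the Op interface, dtypes and the gym_step / 
inner_fn_sketch names are my own assumptions:

from theano.compile.ops import as_op

# Sketch: the Python body of this Op runs at execution time, so env.step()
# would be called on every iteration of the compiled scan loop.
@as_op(itypes=[T.lscalar], otypes=[T.fvector])
def gym_step(action):
    observation, reward, done, info = env.step(int(action))
    return numpy.asarray([reward, float(done)], dtype='float32')

def inner_fn_sketch():
    # Keep the action symbolic instead of calling .eval()
    action = T.argmax(decision, axis=-1)[0]
    step_result = gym_step(action)
    reward = step_result[0]
    return reward

(rewards, scan_updates) = theano.scan(fn=inner_fn_sketch, n_steps=3000)

I suspect the random sampling would also have to move inside the inner 
function, and the updates returned by theano.scan would have to be passed 
to theano.function instead of updates=[], so that the action actually 
changes from step to step; the observation could presumably be returned 
from the Op in the same way as the reward. Does something along these lines 
sound reasonable, or is there a better way?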

Thanks a lot,
Kai
