The attached files contain the minimal amount of code needed to reproduce my
problem.
The model is as follows:
- First, there is a stack of convolutional layers.
- On top of this sits a recurrent module with attention over the convolutional
feature maps.
The lower convolutional feature extractor is a Wide ResNet, and the recurrent
module on top is an LSTM.
The problem is that the model takes too much time to set up and compile. I
have tested with only the convolutional part and with only the recurrent part,
and both begin training within a few minutes. However, when the two are combined
as described above, the model takes hours to set up and compile. I've tested this
on the CPU of my MacBook Air (500 s to set up, 3700 s to compile) and on a GPU
workstation (3600 s to set up and still compiling). Note that the hyper-parameters
specified are for a small network; my aim is to scale this up to deeper nets.
The code uses Lasagne and Theano.
I hope someone can tell me what I can do to make this faster, as it has resisted
all my efforts to speed it up.
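For what it's worth, the compile timings above are just wall-clock time around
theano.function. One comparison that might help localize the problem (a sketch, not
a fix; I am not certain these are the right knobs) is to build the same train
function from the attached script once with the default optimizer and once with a
weaker one:

    import time
    import theano

    # weaker graph optimization; compare against the default 'fast_run'
    theano.config.optimizer = 'fast_compile'

    t0 = time.time()
    train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)
    print "compile time:", time.time() - t0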
# ---- attached file 1: conv_arc.py ----
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse
import lasagne
PI = np.pi
dtype = theano.config.floatX
def ortho_init(shape):
"""
taken from: https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py#L327-L367
"""
a = np.random.normal(0.0, 1.0, shape)
u, _, v = np.linalg.svd(a, full_matrices=False)
W = u if u.shape == shape else v # pick the one with the correct shape
return W.astype(dtype)
def normal_init(shape, sigma):
W = np.random.normal(0.0, sigma, shape)
return W.astype(dtype)
def batched_dot(A, B):
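    # Batched matrix multiply via broadcasting: (n, a, b) x (n, b, c) -> (n, a, c).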
C = A.dimshuffle([0, 1, 2, 'x']) * B.dimshuffle([0, 'x', 1, 2])
return C.sum(axis=-2)
class ConvARC(lasagne.layers.Layer):
def __init__(self, incoming, num_filters, lstm_states, image_size, attn_win,
glimpses, fg_bias_init, final_state_only=True, **kwargs):
super(ConvARC, self).__init__(incoming, **kwargs)
num_input = num_filters * (attn_win ** 2)
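        # Fused LSTM weight matrix: the four gate blocks (z, i, f, o) are stacked along the
        # rows; the columns are [glimpse input | recurrent hidden | bias].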
W_lstm = np.zeros((4 * lstm_states, num_input + lstm_states + 1), dtype=dtype)
print "\t\t Conv ARC weight init begin"
for i in range(4):
W_lstm[i*lstm_states:(i + 1)*lstm_states, :num_input] = ortho_init(shape=(lstm_states, num_input))
W_lstm[i*lstm_states:(i + 1)*lstm_states, num_input:-1] = ortho_init(shape=(lstm_states, lstm_states))
W_lstm[2*lstm_states:3*lstm_states, -1] = fg_bias_init
W_g = normal_init(shape=(3, lstm_states), sigma=0.01)
print "\t\t Conv ARC weight init done."
self.W_lstm = self.add_param(W_lstm, (4 * lstm_states, num_input + lstm_states + 1), name='W_lstm')
self.W_g = self.add_param(W_g, (3, lstm_states), name='W_g')
self.num_filters = num_filters
self.lstm_states = lstm_states
self.image_size = image_size
self.attn_win = attn_win
self.glimpses = glimpses
self.final_state_only = final_state_only
def attend(self, I, H, W):
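        # Predict attention parameters (center_y, center_x, scale) from the hidden state H via W,
        # build separable Cauchy filterbanks F_Y and F_X over the image axes, and apply them to
        # every feature map of I to extract an attn_win x attn_win glimpse.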
attn_win = self.attn_win
image_size = self.image_size
num_filters = self.num_filters
gp = T.tanh(T.dot(W, H.T).T)
center_y = gp[:, 0].dimshuffle(0, 'x')
center_x = gp[:, 1].dimshuffle(0, 'x')
delta = 1.0 - T.abs_(gp[:, 2]).dimshuffle(0, 'x')
gamma = T.exp(1.0 - 2 * T.abs_(gp[:, 2])).dimshuffle([0, 'x', 'x'])
center_y = image_size * (center_y + 1.0) / 2.0
center_x = image_size * (center_x + 1.0) / 2.0
delta = image_size / attn_win * delta
rng = T.arange(attn_win, dtype=dtype) - attn_win / 2.0 + 0.5
cX = center_x + delta * rng
cY = center_y + delta * rng
a = T.arange(image_size, dtype=dtype)
b = T.arange(image_size, dtype=dtype)
F_X = 1.0 + ((a - cX.dimshuffle([0, 1, 'x'])) / gamma) ** 2.0
F_Y = 1.0 + ((b - cY.dimshuffle([0, 1, 'x'])) / gamma) ** 2.0
F_X = 1.0 / (PI * gamma * F_X)
F_Y = 1.0 / (PI * gamma * F_Y)
F_X = F_X / (F_X.sum(axis=-1).dimshuffle(0, 1, 'x') + 1e-4)
F_Y = F_Y / (F_Y.sum(axis=-1).dimshuffle(0, 1, 'x') + 1e-4)
F_X = F_X.repeat(num_filters, axis=0)
F_Y = F_Y.repeat(num_filters, axis=0)
G = batched_dot(batched_dot(F_Y, I), F_X.transpose([0, 2, 1]))
return G
def get_output_for(self, input, **kwargs):
image_size = self.image_size
num_filters = self.num_filters
lstm_states = self.lstm_states
attn_win = self.attn_win
        # input is a 4D tensor: (batch_size, num_filters, rows, cols)
        B = input.shape[0] // 2  # number of image pairs in the batch
odd_input = input[:B]
even_input = input[B:]
# (B * num_filters, image_size, image_size)
odd_input = odd_input.reshape((B * num_filters, image_size, image_size))
even_input = even_input.reshape((B * num_filters, image_size, image_size))
def step(glimpse_count, c_tm1, h_tm1, odd_input, even_input, W_lstm, W_g):
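            # One glimpse per step: alternate between the two images of each pair (via `turn`),
            # attend over the chosen feature maps, and update the LSTM state using the fused
            # weight matrix W_lstm (input, recurrent and bias parts concatenated column-wise).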
# c_tm1, h_tm1 are (B, lstm_states)
turn = T.eq(glimpse_count % 2, 0)
I = ifelse(turn, even_input, odd_input)
# (B, attn_win, attn_win)
glimpse = self.attend(I, h_tm1, W_g)
flat_glimpse = glimpse.reshape((B, num_filters * (attn_win ** 2)))
# (4 * states, num_input + states + 1) x transpose(B, num_input + states + 1)
# result: (4 * states, B)
lstm_ip = T.concatenate([flat_glimpse, h_tm1, T.ones((B, 1))], axis=1)
pre_activation = T.dot(W_lstm, lstm_ip.T)
z = T.tanh(pre_activation[0*lstm_states:1*lstm_states])
i = T.nnet.sigmoid(pre_activation[1*lstm_states:2*lstm_states])
f = T.nnet.sigmoid(pre_activation[2*lstm_states:3*lstm_states])
o = T.nnet.sigmoid(pre_activation[3*lstm_states:4*lstm_states])
# all in (states, B)
c_t = f * c_tm1.T + i * z
h_t = o * T.tanh(c_t)
#c_t = T.clip(c_t, -1.0, 1.0)
#h_t = T.clip(h_t, -1.0, 1.0)
# output: (B, states)
return glimpse_count + 1, c_t.T, h_t.T
glimpse_count_0 = 0
c_0 = T.zeros((B, lstm_states))
h_0 = T.zeros((B, lstm_states))
_, cells, hiddens = theano.scan(fn=step, non_sequences=[odd_input, even_input, self.W_lstm, self.W_g],
outputs_info=[glimpse_count_0, c_0, h_0], n_steps=self.glimpses * 2)[0]
if self.final_state_only:
return hiddens[-1]
else:
return hiddens
def get_output_shape_for(self, input_shape):
        # the real batch size in both cases is input_shape[0] / 2 (pairs),
        # but since input_shape[0] is None here we leave it as it is
if self.final_state_only:
return (input_shape[0], self.lstm_states)
else:
            return (2 * self.glimpses, input_shape[0], self.lstm_states)
if __name__ == "__main__":
from lasagne.layers import InputLayer, Conv2DLayer, get_output
l_in = InputLayer(shape=(2, 1, 7, 7))
l_conv = Conv2DLayer(l_in, num_filters=1, filter_size=(3, 3), stride=(1, 1), pad='same')
l_carc = ConvARC(l_conv, num_filters=1, lstm_states=5, image_size=7, \
attn_win=2, glimpses=1, fg_bias_init=0.0)
y = get_output(l_carc)
fn = theano.function([l_in.input_var], outputs=y)
X = np.random.random((2, 1, 7, 7)).astype(dtype)
print fn(X).shape
# ---- attached file 2: training script ----
learning_rate = 1e-4
image_size = 32
attn_win = 4
glimpses = 2
lstm_states = 64
fg_bias_init = 0.2
batch_size = 32
wrn_n = 2 # the wide resnet will be 4 * wrn_n + 7 conv layers deep
wrn_k = 1 # width of the resnet - a multiplier on the number of filters
print "... importing libraries"
#import sys
#sys.setrecursionlimit(10000)
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers import batch_norm, BatchNormLayer, ExpressionLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import ElemwiseSumLayer, NonlinearityLayer, GlobalPoolLayer
from lasagne.nonlinearities import rectify, sigmoid
from lasagne.init import HeNormal
from lasagne.layers import get_all_params, get_all_layers, get_output
from lasagne.regularization import regularize_layer_params
from lasagne.objectives import binary_crossentropy
from lasagne.updates import adam
from lasagne.layers import helper
from conv_arc import ConvARC
import time
def residual_block(l, increase_dim=False, projection=True, first=False, filters=16):
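    # Pre-activation residual block: BN -> ReLU -> conv -> dropout -> conv, with a 1x1
    # projection shortcut for the first block of a stack or when downsampling.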
if increase_dim:
first_stride = (2, 2)
else:
first_stride = (1, 1)
if first:
bn_pre_relu = l
else:
bn_pre_conv = BatchNormLayer(l)
bn_pre_relu = NonlinearityLayer(bn_pre_conv, rectify)
conv_1 = batch_norm(ConvLayer(bn_pre_relu, num_filters=filters, filter_size=(3,3), stride=first_stride, nonlinearity=rectify, pad='same', W=HeNormal(gain='relu')))
dropout = DropoutLayer(conv_1, p=0.3)
conv_2 = ConvLayer(dropout, num_filters=filters, filter_size=(3,3), stride=(1,1), nonlinearity=None, pad='same', W=HeNormal(gain='relu'))
if increase_dim:
projection = ConvLayer(l, num_filters=filters, filter_size=(1,1), stride=(2,2), nonlinearity=None, pad='same', b=None)
block = ElemwiseSumLayer([conv_2, projection])
elif first:
projection = ConvLayer(l, num_filters=filters, filter_size=(1,1), stride=(1,1), nonlinearity=None, pad='same', b=None)
block = ElemwiseSumLayer([conv_2, projection])
else:
block = ElemwiseSumLayer([conv_2, l])
return block
tick = time.time()
print "... setting up the network"
n_filters = {0: 16, 1: 16 * wrn_k, 2: 32 * wrn_k}
X = T.tensor4("input")
y = T.imatrix("target")
l_in = InputLayer(shape=(None, 1, image_size, image_size), input_var=X)
print "adding first layer...\t", time.time() - tick
l = batch_norm(ConvLayer(l_in, num_filters=n_filters[0], filter_size=(3, 3), \
stride=(1, 1), nonlinearity=rectify, pad='same', W=HeNormal(gain='relu')))
print "adding first stack of residual blocks...\t", time.time() - tick
l = residual_block(l, first=True, filters=n_filters[1])
for i in range(1, wrn_n):
print "\tadded residual block ", i, '\t', time.time() - tick
l = residual_block(l, filters=n_filters[1])
print "adding second stack of residual blocks..."
l = residual_block(l, increase_dim=True, filters=n_filters[2])
for i in range(1, (wrn_n+2)):
print "\tadded residual block ", i, '\t', time.time() - tick
l = residual_block(l, filters=n_filters[2])
bn_post_conv = BatchNormLayer(l)
bn_post_relu = NonlinearityLayer(bn_post_conv, rectify)
print "adding Conv ARC layer\t", time.time() - tick
l_carc = ConvARC(bn_post_relu, num_filters=n_filters[2], lstm_states=lstm_states, image_size=16,
attn_win=attn_win, glimpses=glimpses, fg_bias_init=fg_bias_init)
l_y = DenseLayer(l_carc, num_units=1, nonlinearity=sigmoid)
prediction = get_output(l_y)
prediction_clean = get_output(l_y, deterministic=True)
print "specifying loss and accuracy funcs...\t", time.time() - tick
loss = T.mean(binary_crossentropy(prediction, y))
accuracy = T.mean(T.eq(prediction_clean > 0.5, y), dtype=theano.config.floatX)
print "adding L2 penalty...\t", time.time() - tick
all_layers = get_all_layers(l_y)
l2_penalty = 0.0001 * regularize_layer_params(all_layers, lasagne.regularization.l2)
loss = loss + l2_penalty
print "fetching all params and specifying updates (includes grad step)...\t", time.time() - tick
params = get_all_params(l_y, trainable=True)
updates = adam(loss, params, learning_rate=learning_rate)
print "total number of parameters: ", lasagne.layers.count_params(l_y)
print "... compiling\t", time.time() - tick
train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)
val_fn = theano.function(inputs=[X, y], outputs=[loss, accuracy])
print "... compilation done\t", time.time() - tick
print "testing ... \t", time.time() - tick
X_train = np.random.randn(batch_size, 1, image_size, image_size).astype(theano.config.floatX)
y_train = np.zeros((batch_size // 2, 1), dtype='int32')  # one label per image pair (ConvARC halves the batch)
batch_loss = train_fn(X_train, y_train)
print "all done, exiting...\t", time.time() - tick