The attached files contain the minimal amount of code to reproduce my 
problem.

The model is as follows:

   - First, there is a set of convolutional layers.
   - On top of this sits a recurrent module with attention over the 
   convolutional feature maps.

The lower Conv feature extractor is a Wide ResNet, and the top recurrent 
module is an LSTM.
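
In outline, the stack looks like this (a condensed sketch of the attached 
training script; the single Conv2DLayer below stands in for the Wide 
ResNet feature extractor):

from lasagne.layers import InputLayer, Conv2DLayer, DenseLayer, get_output
from lasagne.nonlinearities import sigmoid
from conv_arc import ConvARC

l_in = InputLayer(shape=(None, 1, 32, 32))
l_conv = Conv2DLayer(l_in, num_filters=32, filter_size=(3, 3), pad='same')  # stand-in for the WRN stack
l_carc = ConvARC(l_conv, num_filters=32, lstm_states=64, image_size=32,
                 attn_win=4, glimpses=2, fg_bias_init=0.2)
l_y = DenseLayer(l_carc, num_units=1, nonlinearity=sigmoid)
y = get_output(l_y)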

The problem is that the model takes too long to set up and compile. I 
have tested with only the convolutional part and with only the recurrent 
part, and both begin training within a few minutes. However, when they 
are combined as described above, the model takes hours to set up and 
compile. I've tested this on the CPU of my MacBook Air (500 s to set up, 
3700 s to compile) and on a GPU workstation (3600 s to set up, and it is 
still compiling). Note that the hyper-parameters specified are for a 
small network; my aim is to scale this up to deeper nets.

Code uses Lasagne and Theano.

I hope someone can tell me what I could do to make this faster, as it has 
withstood all my efforts to speed it up.
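
For reference, the one diagnostic I know of is to lower Theano's graph 
optimization level before compiling, to see whether the time is going 
into the optimizer at all (a sketch; I have not verified that this 
isolates the problem here):

import theano

# with 'fast_compile' most graph optimizations are skipped; if the
# functions still take hours to build in this mode, the time is presumably
# going into graph construction / scan rather than into the optimizer
theano.config.optimizer = 'fast_compile'

# alternatively, THEANO_FLAGS=profile=True,profile_optimizer=True prints
# per-optimization timings at process exit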


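# --- conv_arc.py ---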
import numpy as np

import theano
import theano.tensor as T
from theano.ifelse import ifelse

import lasagne


PI = np.pi
dtype = theano.config.floatX


def ortho_init(shape):
	"""
	taken from: https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py#L327-L367
	"""
	a = np.random.normal(0.0, 1.0, shape)
	u, _, v = np.linalg.svd(a, full_matrices=False)
	W = u if u.shape == shape else v 	# pick the one with the correct shape
	return W.astype(dtype)


def normal_init(shape, sigma):
	W = np.random.normal(0.0, sigma, shape)
	return W.astype(dtype)


def batched_dot(A, B):
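	# A: (N, I, J), B: (N, J, K) -> result: (N, I, K), via a broadcasted multiply + sum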
	C = A.dimshuffle([0, 1, 2, 'x']) * B.dimshuffle([0, 'x', 1, 2])      
	return C.sum(axis=-2)


class ConvARC(lasagne.layers.Layer):
	def __init__(self, incoming, num_filters, lstm_states, image_size, attn_win, 
					glimpses, fg_bias_init, final_state_only=True, **kwargs):
		super(ConvARC, self).__init__(incoming, **kwargs)
		
		num_input = num_filters * (attn_win ** 2)

		W_lstm = np.zeros((4 * lstm_states, num_input + lstm_states + 1), dtype=dtype)
		print "\t\t Conv ARC weight init begin"
		for i in range(4):
			W_lstm[i*lstm_states:(i + 1)*lstm_states, :num_input] = ortho_init(shape=(lstm_states, num_input))
			W_lstm[i*lstm_states:(i + 1)*lstm_states, num_input:-1] = ortho_init(shape=(lstm_states, lstm_states))
		W_lstm[2*lstm_states:3*lstm_states, -1] = fg_bias_init
		W_g = normal_init(shape=(3, lstm_states), sigma=0.01)
		print "\t\t Conv ARC weight init done."

		self.W_lstm = self.add_param(W_lstm, (4 * lstm_states, num_input + lstm_states + 1), name='W_lstm')
		self.W_g = self.add_param(W_g, (3, lstm_states), name='W_g')

		self.num_filters = num_filters
		self.lstm_states = lstm_states
		self.image_size = image_size
		self.attn_win = attn_win
		self.glimpses = glimpses
		self.final_state_only = final_state_only

	def attend(self, I, H, W):
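		# I: (B * num_filters, image_size, image_size) feature maps
		# H: (B, lstm_states) previous hidden state, W: (3, lstm_states) glimpse parameters
		# returns: (B * num_filters, attn_win, attn_win) Cauchy-kernel glimpse of I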
		attn_win = self.attn_win
		image_size = self.image_size
		num_filters = self.num_filters

		gp = T.tanh(T.dot(W, H.T).T)

		center_y = gp[:, 0].dimshuffle(0, 'x')
		center_x = gp[:, 1].dimshuffle(0, 'x')
		delta = 1.0 - T.abs_(gp[:, 2]).dimshuffle(0, 'x')
		gamma = T.exp(1.0 - 2 * T.abs_(gp[:, 2])).dimshuffle([0, 'x', 'x'])

		center_y = image_size * (center_y + 1.0) / 2.0
		center_x = image_size * (center_x + 1.0) / 2.0
		delta = float(image_size) / attn_win * delta 	# float division (avoids Python 2 integer truncation)

		rng = T.arange(attn_win, dtype=dtype) - attn_win / 2.0 + 0.5
		cX = center_x + delta * rng
		cY = center_y + delta * rng

		a = T.arange(image_size, dtype=dtype)
		b = T.arange(image_size, dtype=dtype)

		F_X = 1.0 + ((a - cX.dimshuffle([0, 1, 'x'])) / gamma) ** 2.0 
		F_Y = 1.0 + ((b - cY.dimshuffle([0, 1, 'x'])) / gamma) ** 2.0
		F_X = 1.0 / (PI * gamma * F_X)
		F_Y = 1.0 / (PI * gamma * F_Y)
		F_X = F_X / (F_X.sum(axis=-1).dimshuffle(0, 1, 'x') + 1e-4)
		F_Y = F_Y / (F_Y.sum(axis=-1).dimshuffle(0, 1, 'x') + 1e-4)

		F_X = F_X.repeat(num_filters, axis=0)
		F_Y = F_Y.repeat(num_filters, axis=0)

		G = batched_dot(batched_dot(F_Y, I), F_X.transpose([0, 2, 1]))

		return G

	def get_output_for(self, input, **kwargs):
		
		image_size = self.image_size
		num_filters = self.num_filters
		lstm_states = self.lstm_states
		attn_win = self.attn_win

		# input is a 4D tensor: (batch_size, num_filters, rows, cols)
		B = input.shape[0] // 2 		# pairs in batch (integer division)
		odd_input = input[:B]
		even_input = input[B:]

		# (B * num_filters, image_size, image_size)
		odd_input = odd_input.reshape((B * num_filters, image_size, image_size))
		even_input = even_input.reshape((B * num_filters, image_size, image_size))

		def step(glimpse_count, c_tm1, h_tm1, odd_input, even_input, W_lstm, W_g):
			# c_tm1, h_tm1 are (B, lstm_states)
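			# glimpses alternate between the two images of each pair:
			# even glimpse counts attend to even_input, odd counts to odd_input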
			
			turn = T.eq(glimpse_count % 2, 0)
			I = ifelse(turn, even_input, odd_input)
			
			# (B, attn_win, attn_win)
			glimpse = self.attend(I, h_tm1, W_g)
			flat_glimpse = glimpse.reshape((B, num_filters * (attn_win ** 2)))

			# (4 * states, num_input + states + 1) x transpose(B, num_input + states + 1)
			# result: (4 * states, B)
			lstm_ip = T.concatenate([flat_glimpse, h_tm1, T.ones((B, 1))], axis=1)
			pre_activation = T.dot(W_lstm, lstm_ip.T) 	

			z = T.tanh(pre_activation[0*lstm_states:1*lstm_states])
			i = T.nnet.sigmoid(pre_activation[1*lstm_states:2*lstm_states])
			f = T.nnet.sigmoid(pre_activation[2*lstm_states:3*lstm_states])
			o = T.nnet.sigmoid(pre_activation[3*lstm_states:4*lstm_states])

			# all in (states, B)
			c_t = f * c_tm1.T + i * z
			h_t = o * T.tanh(c_t)

			#c_t = T.clip(c_t, -1.0, 1.0)
			#h_t = T.clip(h_t, -1.0, 1.0)

			# output: (B, states)
			return glimpse_count + 1, c_t.T, h_t.T

		glimpse_count_0 = 0
		c_0 = T.zeros((B, lstm_states))
		h_0 = T.zeros((B, lstm_states))

		_, cells, hiddens = theano.scan(fn=step, non_sequences=[odd_input, even_input, self.W_lstm, self.W_g], 
						outputs_info=[glimpse_count_0, c_0, h_0], n_steps=self.glimpses * 2)[0]

		if self.final_state_only:
			return hiddens[-1]
		else:
			return hiddens

	def get_output_shape_for(self, input_shape):
		# the batch size in both cases should really be input_shape[0] / 2,
		# but since it is None here, we leave it as is
		if self.final_state_only:
			return (input_shape[0], self.lstm_states)
		else:
			return (2 * self.glimpses, input_shape[0], self.lstm_states)


if __name__ == "__main__":
	
	from lasagne.layers import InputLayer, Conv2DLayer, get_output
	
	l_in = InputLayer(shape=(2, 1, 7, 7))
	l_conv = Conv2DLayer(l_in, num_filters=1, filter_size=(3, 3), stride=(1, 1), pad='same')
	l_carc = ConvARC(l_conv, num_filters=1, lstm_states=5, image_size=7, \
						attn_win=2, glimpses=1, fg_bias_init=0.0)

	y = get_output(l_carc)

	fn = theano.function([l_in.input_var], outputs=y)
	X = np.random.random((2, 1, 7, 7)).astype(dtype)
	
	print fn(X).shape
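# --- second attached file: training script ---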
learning_rate = 1e-4
image_size = 32
attn_win = 4
glimpses = 2
lstm_states = 64
fg_bias_init = 0.2
batch_size = 32

wrn_n = 2	# the wide resnet will be 4 * wrn_n + 7 conv layers deep
wrn_k = 1 	# width of the resnet - a multiplier on the number of filters


print "... importing libraries"
#import sys
#sys.setrecursionlimit(10000)

import numpy as np

import theano
import theano.tensor as T

import lasagne
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers import batch_norm, BatchNormLayer, ExpressionLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import ElemwiseSumLayer, NonlinearityLayer, GlobalPoolLayer
from lasagne.nonlinearities import rectify, sigmoid
from lasagne.init import HeNormal
from lasagne.layers import get_all_params, get_all_layers, get_output
from lasagne.regularization import regularize_layer_params
from lasagne.objectives import binary_crossentropy
from lasagne.updates import adam
from lasagne.layers import helper

from conv_arc import ConvARC

import time


def residual_block(l, increase_dim=False, projection=True, first=False, filters=16):
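	# pre-activation wide-resnet basic block: BN -> ReLU -> 3x3 conv -> dropout -> 3x3 conv,
	# summed with the input (or with a 1x1 projection shortcut when the shape changes);
	# the BN/ReLU pre-activation is skipped for the very first block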
	if increase_dim:
		first_stride = (2, 2)
	else:
		first_stride = (1, 1)
	
	if first:
		bn_pre_relu = l
	else:
		bn_pre_conv = BatchNormLayer(l)
		bn_pre_relu = NonlinearityLayer(bn_pre_conv, rectify)
	
	conv_1 = batch_norm(ConvLayer(bn_pre_relu, num_filters=filters, filter_size=(3,3), stride=first_stride, nonlinearity=rectify, pad='same', W=HeNormal(gain='relu')))
	dropout = DropoutLayer(conv_1, p=0.3)
	conv_2 = ConvLayer(dropout, num_filters=filters, filter_size=(3,3), stride=(1,1), nonlinearity=None, pad='same', W=HeNormal(gain='relu'))
	
	if increase_dim:
		projection = ConvLayer(l, num_filters=filters, filter_size=(1,1), stride=(2,2), nonlinearity=None, pad='same', b=None)
		block = ElemwiseSumLayer([conv_2, projection])
	elif first:
		projection = ConvLayer(l, num_filters=filters, filter_size=(1,1), stride=(1,1), nonlinearity=None, pad='same', b=None)
		block = ElemwiseSumLayer([conv_2, projection])
	else:
		block = ElemwiseSumLayer([conv_2, l])
	
	return block

tick = time.time()

print "... setting up the network"
n_filters = {0: 16, 1: 16 * wrn_k, 2: 32 * wrn_k}

X = T.tensor4("input")
y = T.imatrix("target")

l_in = InputLayer(shape=(None, 1, image_size, image_size), input_var=X)

print "adding first layer...\t", time.time() - tick
l = batch_norm(ConvLayer(l_in, num_filters=n_filters[0], filter_size=(3, 3), \
	stride=(1, 1), nonlinearity=rectify, pad='same', W=HeNormal(gain='relu')))

print "adding first stack of residual blocks...\t", time.time() - tick
l = residual_block(l, first=True, filters=n_filters[1])
for i in range(1, wrn_n):
	print "\tadded residual block ", i, '\t', time.time() - tick
	l = residual_block(l, filters=n_filters[1])

print "adding second stack of residual blocks..."
l = residual_block(l, increase_dim=True, filters=n_filters[2])
for i in range(1, (wrn_n+2)):
	print "\tadded residual block ", i, '\t', time.time() - tick
	l = residual_block(l, filters=n_filters[2])

bn_post_conv = BatchNormLayer(l)
bn_post_relu = NonlinearityLayer(bn_post_conv, rectify)

print "adding Conv ARC layer\t", time.time() - tick

l_carc = ConvARC(bn_post_relu, num_filters=n_filters[2], lstm_states=lstm_states, image_size=16, 
					attn_win=attn_win, glimpses=glimpses, fg_bias_init=fg_bias_init)
l_y = DenseLayer(l_carc, num_units=1, nonlinearity=sigmoid)

prediction = get_output(l_y)
prediction_clean = get_output(l_y, deterministic=True)

print "specifying loss and accuracy funcs...\t", time.time() - tick
loss = T.mean(binary_crossentropy(prediction, y))
accuracy = T.mean(T.eq(prediction_clean > 0.5, y), dtype=theano.config.floatX)

print "adding L2 penalty...\t", time.time() - tick
all_layers = get_all_layers(l_y)
l2_penalty = 0.0001 * regularize_layer_params(all_layers, lasagne.regularization.l2)
loss = loss + l2_penalty

print "fetching all params and specifying updates (includes grad step)...\t", time.time() - tick
params = get_all_params(l_y, trainable=True)
updates = adam(loss, params, learning_rate=learning_rate)

print "total number of parameters: ", lasagne.layers.count_params(l_y)

print "... compiling\t", time.time() - tick
train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)
val_fn = theano.function(inputs=[X, y], outputs=[loss, accuracy])

print "... compilation done\t", time.time() - tick

print "testing ... \t", time.time() - tick
X_train = np.random.randn(batch_size, 1, image_size, image_size).astype(theano.config.floatX)
y_train = np.zeros((batch_size // 2, 1), dtype='int32') 	# one target per image pair, since ConvARC pairs up the batch
batch_loss = train_fn(X_train, y_train)

print "all done, exiting...\t", time.time() - tick
