Repository: incubator-singa Updated Branches: refs/heads/master 9b2d16134 -> f2ad93381
SINGA-300 - Add residual networks for imagenet classification Add the wide residual network for imagenet serving; refer to https://github.com/szagoruyko/wide-residual-networks convert original resnet from torch to singa. tested wide resnet; Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c0317d18 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c0317d18 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c0317d18 Branch: refs/heads/master Commit: c0317d185004ea594f19b74cbb9bc2c97e66a7b1 Parents: 59ca44a Author: Wei Wang <[email protected]> Authored: Fri Feb 10 14:32:42 2017 +0800 Committer: Wei Wang <[email protected]> Committed: Sun Feb 12 22:00:46 2017 +0800 ---------------------------------------------------------------------- examples/imagenet/resnet/convert.py | 48 +++++++++++ examples/imagenet/resnet/model.py | 138 ++++++++++++++++++++++++++++++ examples/imagenet/resnet/serve.py | 138 ++++++++++++++++++++++++++++++ python/singa/device.py | 4 +- python/singa/layer.py | 33 +++---- python/singa/net.py | 34 +++++--- src/model/layer/convolution.cc | 8 +- src/model/layer/convolution.h | 5 +- src/model/layer/cudnn_convolution.cc | 8 +- src/model/layer/dense.cc | 16 ++-- src/model/layer/dense.h | 7 +- 11 files changed, 398 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/examples/imagenet/resnet/convert.py ---------------------------------------------------------------------- diff --git a/examples/imagenet/resnet/convert.py b/examples/imagenet/resnet/convert.py new file mode 100644 index 0000000..c976bf5 --- /dev/null +++ b/examples/imagenet/resnet/convert.py @@ -0,0 +1,48 @@ +import torchfile +import numpy as np +import cPickle as pickle + +def conv(m, name, params): + outplane = m['weight'].shape[0] + params[name + '-conv_weight'] = np.reshape(m['weight'], (outplane, -1)) + return params + +def batchnorm(m, name, params): + params[name + '-bn_gamma'] = m['weight'] + params[name + '-bn_beta'] = m['bias'] + params[name + '-bn_mean'] = m['running_mean'] + params[name + '-bn_var'] = m['running_var'] + return params + +def block(m, name, params, has_identity): + branch=m[0].modules[0].modules + params = conv(branch[0], name + '-1', params) + params = batchnorm(branch[1], name + '-1', params) + params = conv(branch[3], name + '-2', params) + params = batchnorm(branch[4], name + '-2', params) + params = conv(branch[6], name + '-3', params) + params = batchnorm(branch[7], name + '-3', params) + if not has_identity: + shortcut = m[0].modules[1].modules + params = conv(shortcut[0], name + '-shortcut', params) + params = batchnorm(shortcut[1], name + '-shortcut', params) + return params + +def stage(sid, m, num_blk, params): + for i in range(num_blk): + params = block(m[i].modules, 'stage%d-blk%d' % (sid, i), params, i!=0) + return params + +params = {} +model = torchfile.load('wrn-50-2.t7').modules +params = conv(model[0], 'input', params) +params = batchnorm(model[1], 'input', params) +params = stage(0, model[4].modules, 3, params) +params = stage(1, model[5].modules, 4, params) +params = stage(2, model[6].modules, 6, params) +params = stage(3, model[7].modules, 3, params) + +params['dense_weight'] = np.transpose(model[10]['weight']) +params['dense_bias'] = model[10]['bias'] +with open('wrn-50-2.pickle', 'wb') as fd: + pickle.dump(params, fd) 
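A quick way to validate the conversion above is to reload the generated pickle and check a few parameter shapes. This is only a hedged sketch, not part of the patch: it assumes convert.py has already been run against the released wrn-50-2.t7 and that wrn-50-2.pickle sits in the working directory; the shapes in the comments are illustrative.

# Sanity-check the parameter dict written by convert.py (assumes wrn-50-2.pickle exists).
import cPickle as pickle

with open('wrn-50-2.pickle', 'rb') as fd:
    params = pickle.load(fd)

print 'number of params:', len(params)
# conv weights are stored as 2-D matrices (outplane, inplane * kernel_h * kernel_w),
# matching the reshape in conv() above; e.g. the 7x7x3 input conv becomes (64, 147).
print 'input-conv_weight', params['input-conv_weight'].shape
# the dense weight was transposed to (in_features, num_classes) to fit SINGA's Dense layer.
print 'dense_weight', params['dense_weight'].shape
for name in sorted(params)[:5]:
    print name, params[name].shape
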
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/examples/imagenet/resnet/model.py ---------------------------------------------------------------------- diff --git a/examples/imagenet/resnet/model.py b/examples/imagenet/resnet/model.py new file mode 100644 index 0000000..7c9a3cf --- /dev/null +++ b/examples/imagenet/resnet/model.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +''' This model is created following https://github.com/facebook/fb.resnet.torch.git +''' +from singa.layer import Conv2D, Activation, MaxPooling2D, AvgPooling2D,\ + Split, Merge, Flatten, Dense, BatchNormalization, Softmax +from singa import net as ffnet +from singa import initializer + +ffnet.verbose=True + +conv_bias = False + +def conv(net, prefix, n, ksize, stride=1, pad=0, bn=True, relu=True, src=None): + ret = net.add(Conv2D( + prefix + '-conv', n, ksize, stride, pad=pad, use_bias=conv_bias), src) + if bn: + ret = net.add(BatchNormalization(prefix + '-bn')) + if relu: + ret = net.add(Activation(prefix + '-relu')) + return ret + + +def shortcut(net, prefix, inplane, outplane, stride, src): + if inplane == outplane: + return src + return conv(net, prefix + '-shortcut', outplane, 1, stride, 0, True, False, src) + + +def bottleneck(name, net, inplane, midplane, outplane, stride=1, preact=False): + split = net.add(Split(name + '-split', 2)) + conv(net, name + '-1', midplane, 1, 1, 0, True, True, src=split) + conv(net, name + '-2', midplane, 3, stride, 1, True, True) + br0 = conv(net, name + '-3', outplane, 1, 1, 0, True, False) + br1 = shortcut(net, name, inplane, outplane, stride, split) + net.add(Merge(name + '-add'), [br0, br1]) + return net.add(Activation(name + '-relu')) + +def basicblock(name, net, inplane, midplane, outplane, stride=1, preact=False): + assert midplane==outplane, 'midplan and outplane should be the same' + split = net.add(Split(name + '-split', 2)) + if preact: + net.add(BatchNormalization(name + '-preact-bn'), split) + net.add(Activation(name + '-preact-relu')) + conv(net, name + '-1', outplane, 3, stride, 1, True, True, split) + br0 = conv(net, name + '-2', outplane, 3, 1, 1, True, False) + br1 = shortcut(net, name, inplane, outplane, stride, split) + net.add(Merge(name + '-add'), [br0, br1]) + return net.add(Activation(name + '-add-relu')) + + +def stage(sid, net, num_blk, inplane, midplane, outplane, stride, block): + block('stage%d-blk%d' % (sid, 0), net, inplane, midplane, outplane, stride) + for i in range(1, num_blk): + block('stage%d-blk%d' % (sid, i), net, outplane, midplane, outplane) + +def init_params(net, weight_path): + if weight_path == None: + for pname, pval in zip(net.param_names(), net.param_values()): + print pname, 
pval.shape + if 'conv' in pname and len(pval.shape) > 1: + initializer.gaussian(pval, 0, pval.shape[1]) + elif 'dense' in pname: + if len(pval.shape) > 1: + initializer.gaussian(pval, 0, pval.shape[0]) + else: + pval.set_value(0) + # init params from batch norm layer + elif 'mean' in pname or 'beta' in pname: + pval.set_value(0) + elif 'var' in pname: + pval.set_value(1) + elif 'gamma' in pname: + initializer.uniform(pval, 0, 1) + else: + net.load(weight_path, use_pickle = 'pickle' in weight_path) +def create_resnet(weight_path=None, depth=50): + cfg = { + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck), + } + net = ffnet.FeedForwardNet() + net.add(Conv2D('input-conv', 64, 7, 2, pad=3, input_sample_shape=(3, 224, 224))) + net.add(BatchNormalization('input-bn')) + net.add(Activation('input_relu')) + net.add(MaxPooling2D('input_pool', 3, 2, pad=1)) + + conf = cfg[depth] + stage(0, net, conf[0][0], 64, 64, 256, 1, conf[1]) + stage(1, net, conf[0][1], 256, 128, 512, 2, conf[1]) + stage(2, net, conf[0][2], 512, 256, 1024, 2, conf[1]) + stage(3, net, conf[0][3], 1024, 512, 2048, 2, conf[1]) + net.add(AvgPooling2D('avg', 7, 1)) + net.add(Flatten('flat')) + net.add(Dense('dense', 1000)) + + init_params(net, weight_path) + return net + + +def create_wide_resnet(weight_path=None): + net = ffnet.FeedForwardNet() + net.add(Conv2D('input-conv', 64, 7, 2, pad=3, use_bias=False, input_sample_shape=(3, 224, 224))) + net.add(BatchNormalization('input-bn')) + net.add(Activation('input_relu')) + net.add(MaxPooling2D('input_pool', 3, 2, pad=1)) + + stage(0, net, 3, 64, 128, 256, 1, bottleneck) + stage(1, net, 4, 256, 256, 512, 2, bottleneck) + stage(2, net, 6, 512, 512, 1024, 2, bottleneck) + stage(3, net, 3, 1024, 1024, 2048, 2, bottleneck) + + net.add(AvgPooling2D('avg_pool', 7, 1, pad=0)) + net.add(Flatten('flat')) + net.add(Dense('dense', 1000)) + + init_params(net, weight_path) + return net + + +if __name__ == '__main__': + create_wide_resnet('wrn-50-2.pickle') http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/examples/imagenet/resnet/serve.py ---------------------------------------------------------------------- diff --git a/examples/imagenet/resnet/serve.py b/examples/imagenet/resnet/serve.py new file mode 100644 index 0000000..d9609ce --- /dev/null +++ b/examples/imagenet/resnet/serve.py @@ -0,0 +1,138 @@ +import os +import sys +import time +import numpy as np +import threading +import traceback +from scipy.misc import imread, imresize +from argparse import ArgumentParser + +from singa import device +from singa import tensor +from singa import data +from singa import image_tool +from singa import metric +from rafiki.agent import Agent, MsgType +import model + +tool = image_tool.ImageTool() +num_augmentation = 10 +crop_size = 224 +mean = np.array([0.485, 0.456, 0.406]) +std = np.array([ 0.229, 0.224, 0.225]) +def image_transform(img): + '''Input an image path and return a set of augmented images (type Image)''' + global tool + return tool.load(img).resize_by_list([256]).crop5((crop_size, crop_size), 5).flip(2).get() + + +def predict(net, images, num=10): + '''predict probability distribution for one net. + + Args: + net: neural net (vgg or resnet) + images: a batch of augmented images (type numpy) + num: num of augmentations + ''' + prob = net.predict(images) + prob = tensor.to_numpy(prob) + prob = prob.reshape((images.shape[0] / num, num, -1)) + prob = np.average(prob, 1) + return prob + + +def allowed_file(filename): + return '.'
in filename and filename.rsplit('.', 1)[1] in \ + ["PNG", "png", "jpg", "JPG", "JPEG", "jpeg"] + + +def serve(net, label_map, dev, agent, topk=5): + '''Serve to predict image labels. + + It prints the topk food names for each image. + + Args: + label_map: a list of food names, corresponding to the index in meta_file + ''' + + images =tensor.Tensor((num_augmentation, 3, crop_size, crop_size), dev) + while True: + msg, val = agent.pull() + if msg is None: + time.sleep(0.1) + continue + msg = MsgType.parse(msg) + if msg.is_request(): + try: + # process images + im = [np.array(x.convert('RGB'), dtype=np.float32).transpose(2, 0, 1) for x in image_transform(val['image'])] + im = np.array(im) / 256 + im -= mean[np.newaxis, :, np.newaxis, np.newaxis] + im /= std[np.newaxis, :, np.newaxis, np.newaxis] + images.copy_from_numpy(im) + print "input: ", images.l1() + # do prediction + prob = predict(net, images, num_augmentation)[0] + idx = np.argsort(-prob) + # prepare results + response = "" + for i in range(topk): + response += "%s:%f <br/>" % (label_map[idx[i]], prob[idx[i]]) + except: + traceback.print_exc() + response = "sorry, system error during prediction." + agent.push(MsgType.kResponse, response) + elif msg.is_command(): + if MsgType.kCommandStop.equal(msg): + print 'get stop command' + agent.push(MsgType.kStatus, "success") + break + else: + print 'get unsupported command %s' % str(msg) + agent.push(MsgType.kStatus, "Unknown command") + else: + print 'get unsupported message %s' % str(msg) + agent.push(MsgType.kStatus, "unsupported msg; going to shutdown") + break + print "server stop" + +def main(): + try: + # Setup argument parser + parser = ArgumentParser(description="Wide residual network") + + parser.add_argument("-p", "--port", default=9999, help="listen port") + parser.add_argument("-c", "--use_cpu", action="store_true", + help="If set, load models onto CPU devices") + parser.add_argument("--parameter_file", default="wrn-50-2.pickle") + + # Process arguments + args = parser.parse_args() + port = args.port + + # start to train + agent = Agent(port) + + net = model.create_wide_resnet(args.parameter_file) + dev = device.create_cuda_gpu() + net.to_device(dev) + print 'Finish loading models' + + labels = np.loadtxt('synset_words.txt', str, delimiter='\t ') + serve(net, labels, dev, agent) + + # acc = evaluate(net, '../val_list.txt', 'image/val', dev) + # print acc + + # wait the agent finish handling http request + agent.stop() + except SystemExit: + return + except: + traceback.print_exc() + sys.stderr.write(" for help use --help \n\n") + return 2 + + +if __name__ == '__main__': + main() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/python/singa/device.py ---------------------------------------------------------------------- diff --git a/python/singa/device.py b/python/singa/device.py index 1df4c84..fdd2a92 100644 --- a/python/singa/device.py +++ b/python/singa/device.py @@ -132,12 +132,12 @@ def create_cuda_gpu_on(device_id): def create_opencl_device(): '''Create the default OpenCL device. - + Returns: a swig converted OpenCL device. ''' assert singa.USE_OPENCL, 'SINGA has not been compiled with OpenCL enabled.' 
- return singa.Platform.GetDefaultDevice() + return singa.Platform.GetDefaultOpenclDevice() default_device = singa.Platform.GetDefaultDevice() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/python/singa/layer.py ---------------------------------------------------------------------- diff --git a/python/singa/layer.py b/python/singa/layer.py index 0bea2d2..7975042 100644 --- a/python/singa/layer.py +++ b/python/singa/layer.py @@ -337,18 +337,19 @@ class Conv2D(Layer): # conf.data_format = data_format if W_specs is None: W_specs = {'init': 'xavier'} - if b_specs is None: - b_specs = {'init': 'constant'} if 'name' not in W_specs: W_specs['name'] = name + '_weight' - if 'name' not in b_specs: - b_specs['name'] = name + '_bias' wspecs = _construct_param_specs_from_dict(W_specs) self.conf.param.extend([wspecs]) self.param_specs.append(wspecs) - bspecs = _construct_param_specs_from_dict(b_specs) - self.conf.param.extend([bspecs]) - self.param_specs.append(bspecs) + if use_bias: + if b_specs is None: + b_specs = {'init': 'constant'} + if 'name' not in b_specs: + b_specs['name'] = name + '_bias' + bspecs = _construct_param_specs_from_dict(b_specs) + self.conf.param.extend([bspecs]) + self.param_specs.append(bspecs) _check_engine(engine, ['cudnn', 'singacpp', 'singacl']) self.layer = _create_layer(engine, 'Convolution') @@ -610,16 +611,19 @@ class Dense(Layer): conf.transpose = W_transpose if W_specs is None: W_specs = {'init': 'xavier'} - if b_specs is None: - b_specs = {'init': 'constant', 'value': 0} if 'name' not in W_specs: W_specs['name'] = name + '_weight' - if 'name' not in b_specs: - b_specs['name'] = name + '_bias' wspecs = _construct_param_specs_from_dict(W_specs) - bspecs = _construct_param_specs_from_dict(b_specs) - self.conf.param.extend([wspecs, bspecs]) - self.param_specs.extend([wspecs, bspecs]) + self.conf.param.extend([wspecs]) + self.param_specs.append(wspecs) + if use_bias: + if b_specs is None: + b_specs = {'init': 'constant', 'value': 0} + if 'name' not in b_specs: + b_specs['name'] = name + '_bias' + bspecs = _construct_param_specs_from_dict(b_specs) + self.conf.param.extend([bspecs]) + self.param_specs.append(bspecs) # dense layer is transparent to engine. if engine == 'cudnn': self.layer = _create_layer('singacuda', 'Dense') @@ -775,7 +779,6 @@ class Split(Layer): input_sample_shape: includes a single integer for the input sample feature size. 
''' - def __init__(self, name, num_output, input_sample_shape=None): self.num_output = num_output self.in_shape = input_sample_shape http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/python/singa/net.py ---------------------------------------------------------------------- diff --git a/python/singa/net.py b/python/singa/net.py index 027e78c..26fb61d 100644 --- a/python/singa/net.py +++ b/python/singa/net.py @@ -386,16 +386,16 @@ class FeedForwardNet(object): ''' if use_pickle: params = {} - for (specs, val) in zip(self.param_specs(), self.param_values()): + for (name, val) in zip(self.param_names(), self.param_values()): val.to_host() - params[specs.name] = tensor.to_numpy(val) + params[name] = tensor.to_numpy(val) with open(f, 'wb') as fd: pickle.dump(params, fd) else: sp = snapshot.Snapshot(f, True, buffer_size) - for (specs, val) in zip(self.param_specs(), self.param_values()): + for (name, val) in zip(self.param_names(), self.param_values()): val.to_host() - sp.write(specs.name, val) + sp.write(name, val) def load(self, f, buffer_size=10, use_pickle=False): '''Load model parameters using io/snapshot. @@ -407,18 +407,30 @@ class FeedForwardNet(object): 'then set use_pickle=False for loading it' with open(f, 'rb') as fd: params = pickle.load(fd) - for (specs, val) in zip(self.param_specs(), - self.param_values()): + for name, val in zip(self.param_names(), self.param_values()): + if name not in params: + print 'Param: %s missing in the checkpoint file' % name + continue try: - val.copy_from_numpy(params[specs.name]) + val.copy_from_numpy(params[name]) except AssertionError as err: - print 'Error from copying values for param: %s' % specs.name - print 'shape of param vs checkpoint', val.shape, params[specs.name].shape + print 'Error from copying values for param: %s' % name + print 'shape of param vs checkpoint', \ + val.shape, params[name].shape raise err else: print 'NOTE: If your model was saved using pickle, '\ 'then set use_pickle=True for loading it' sp = snapshot.Snapshot(f, False, buffer_size) params = sp.read() - for (specs, val) in zip(self.param_specs(), self.param_values()): - val.copy_data(params[specs.name]) + for (name, val) in zip(self.param_names(), self.param_values()): + if name not in params: + print 'Param: %s missing in the checkpoint file' % name + continue + try: + val.copy_data(params[name]) + except AssertionError as err: + print 'Error from copying values for param: %s' % name + print 'shape of param vs checkpoint', \ + val.shape, params[name].shape + raise err http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/src/model/layer/convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc index 78ec1af..8940fb2 100644 --- a/src/model/layer/convolution.cc +++ b/src/model/layer/convolution.cc @@ -97,7 +97,8 @@ void Convolution::Setup(const Shape &in_sample, const LayerConf &conf) { // Setup shape of weight_ and bias_ weight_.Reshape(Shape{num_filters_, col_height_}); - bias_.Reshape(Shape{num_filters_}); + if (bias_term_) + bias_.Reshape(Shape{num_filters_}); // Assume the order of param is: weight, bias for (const auto &spec : conf.param()) param_specs_.push_back(spec); } @@ -143,7 +144,6 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( Tensor dx; Tensor db, dw; dx.ResetLike(src_data); - db.ResetLike(bias_); dw.ResetLike(weight_); dw.SetValue(0.0f); size_t batchsize = grad.shape(0); @@ -156,6 +156,7 @@ const 
std::pair<Tensor, vector<Tensor>> Convolution::Backward( SumColumns(tmp1, &tmp2); Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + db.ResetLike(bias_); SumRows(tmp3, &db); } @@ -178,7 +179,8 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize); } param_grad.push_back(dw); - param_grad.push_back(db); + if (bias_term_) + param_grad.push_back(db); delete[] data_col; delete[] dx_b; return std::make_pair(dx, param_grad); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/src/model/layer/convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h index 7b7fd00..89b5319 100644 --- a/src/model/layer/convolution.h +++ b/src/model/layer/convolution.h @@ -57,7 +57,10 @@ class Convolution : public Layer { const int stride_w, float* data_im); const std::vector<Tensor> param_values() override { - return std::vector<Tensor>{weight_, bias_}; + if (bias_term_) + return std::vector<Tensor>{weight_, bias_}; + else + return std::vector<Tensor>{weight_}; } size_t kernel_w() const { return kernel_w_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/src/model/layer/cudnn_convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc index 196d137..03ad8b9 100644 --- a/src/model/layer/cudnn_convolution.cc +++ b/src/model/layer/cudnn_convolution.cc @@ -60,7 +60,8 @@ void CudnnConvolution::InitCudnn(const Tensor &input) { size_t batchsize = input.shape(0); CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_)); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_)); + if (bias_term_) + CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_)); CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_)); CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_)); @@ -209,11 +210,11 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward( Tensor dx; dx.ResetLike(src_data); Tensor db, dw; - db.ResetLike(bias_); dw.ResetLike(weight_); // LOG(ERROR) << "backward bias"; if (bias_term_) { + db.ResetLike(bias_); dx.device()->Exec([grad, db, this](Context *ctx) { Block *dyblock = grad.block(), *dbblock = db.block(); float alpha = 1.f, beta = 0.f; @@ -248,7 +249,8 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward( this->x_desc_, dxblock->mutable_data()); }, {grad.block(), weight_.block()}, {dx.block(), workspace_.block()}); param_grad.push_back(dw); - param_grad.push_back(db); + if (bias_term_) + param_grad.push_back(db); return std::make_pair(dx, param_grad); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/src/model/layer/dense.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc index 64e3d86..fac9130 100644 --- a/src/model/layer/dense.cc +++ b/src/model/layer/dense.cc @@ -38,11 +38,13 @@ void Dense::Setup(const Shape& in_sample, const LayerConf &conf) { vdim_ = in_sample.at(0); hdim_ = dense_conf.num_output(); transpose_ = dense_conf.transpose(); + bias_term_ = dense_conf.bias_term(); if (transpose_) // was {vdim_, hdim} by zhaojing? 
weight_.Reshape(Shape{hdim_, vdim_}); else weight_.Reshape(Shape{vdim_, hdim_}); - bias_.Reshape(Shape{hdim_}); + if (bias_term_) + bias_.Reshape(Shape{hdim_}); for (auto specs: conf.param()) param_specs_.push_back(specs); } @@ -56,7 +58,8 @@ const Tensor Dense::Forward(int flag, const Tensor &input) { output = Mult(input, weight_.T()); else output = Mult(input, weight_); - AddRow(bias_, &output); + if (bias_term_) + AddRow(bias_, &output); if (flag & kTrain) buf_.push(input); return output; @@ -70,10 +73,12 @@ const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag, Tensor src_data = buf_.top(); buf_.pop(); Tensor db, dw, dx; - db.ResetLike(bias_); dw.ResetLike(weight_); dx.ResetLike(src_data); - SumRows(grad, &db); + if (bias_term_) { + db.ResetLike(bias_); + SumRows(grad, &db); + } if (transpose_) { dx = Mult(grad, weight_); dw = Mult(grad.T(), src_data); @@ -82,7 +87,8 @@ const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag, dw = Mult(src_data.T(), grad); } param_grad.push_back(dw); - param_grad.push_back(db); + if (bias_term_) + param_grad.push_back(db); return std::make_pair(dx, param_grad); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c0317d18/src/model/layer/dense.h ---------------------------------------------------------------------- diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h index 8a149a5..8f53699 100644 --- a/src/model/layer/dense.h +++ b/src/model/layer/dense.h @@ -46,7 +46,10 @@ class Dense : public Layer { void ToDevice(std::shared_ptr<Device> device) override; const std::vector<Tensor> param_values() override { - return std::vector<Tensor>{weight_, bias_}; + if (bias_term_) + return std::vector<Tensor>{weight_, bias_}; + else + return std::vector<Tensor>{weight_}; } size_t num_output() const { return hdim_; } size_t num_input() const { return vdim_; } @@ -67,6 +70,8 @@ class Dense : public Layer { /// Used in auto-encoder, where the decoder would share its weight matrix from /// the encoder's transposed weight matrix. bool transpose_ = false; + /// use bias or not; + bool bias_term_ = true; size_t vdim_, hdim_; Tensor weight_, bias_; // Tensor data_, grad_;
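Taken together, the layer.py, dense and convolution changes above make the bias term optional end to end: with use_bias=False no bias ParamSpec is registered on the Python side, param_values() returns only the weight on the C++ side, and Backward() skips the bias gradient. A minimal sketch of the resulting Python behaviour, assuming a CUDA build (the layer names and shapes below are made up for illustration):

# With use_bias=False only the weight spec should be registered (no '*_bias' entry).
from singa.layer import Conv2D, Dense

conv = Conv2D('conv1', 64, 3, 1, pad=1, use_bias=False,
              input_sample_shape=(3, 224, 224))
print [spec.name for spec in conv.param_specs]   # expected: ['conv1_weight']

dense = Dense('fc', 10, use_bias=False, input_sample_shape=(256,))
print [spec.name for spec in dense.param_specs]  # expected: ['fc_weight']
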

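For completeness, here is a hedged sketch of exercising the new wide residual network outside the rafiki agent loop, stitched together only from pieces introduced in this commit (create_wide_resnet in model.py and the augmentation/averaging logic in serve.py). It assumes a CUDA build of SINGA and that wrn-50-2.pickle, synset_words.txt and a local test.jpg are present; the file names are placeholders.

# Offline smoke test of the wide resnet; a sketch only, not part of the patch.
import numpy as np
from singa import device, tensor, image_tool
import model

mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

net = model.create_wide_resnet('wrn-50-2.pickle')
dev = device.create_cuda_gpu()
net.to_device(dev)

# Same 10-view augmentation as serve.py: resize to 256, five 224x224 crops, plus flips.
tool = image_tool.ImageTool()
imgs = tool.load('test.jpg').resize_by_list([256]).crop5((224, 224), 5).flip(2).get()
x = np.array([np.array(im.convert('RGB'), dtype=np.float32).transpose(2, 0, 1)
              for im in imgs]) / 256
x -= mean[np.newaxis, :, np.newaxis, np.newaxis]
x /= std[np.newaxis, :, np.newaxis, np.newaxis]

tx = tensor.Tensor(x.shape, dev)
tx.copy_from_numpy(x)

# Average the network outputs over the augmented views, as predict() in serve.py does.
out = tensor.to_numpy(net.predict(tx))
out = np.average(out.reshape((1, len(imgs), -1)), 1)[0]

labels = np.loadtxt('synset_words.txt', str, delimiter='\t ')
for idx in np.argsort(-out)[:5]:
    print labels[idx], out[idx]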