SINGA-238 RBM on MNIST

Enable training on GPU. Fixed a bug in KernelSum() by removing it and
implementing the Sum(Tensor)->float function using Dot() (from BLAS).
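The new Sum(Tensor) computes the sum as a dot product between the tensor and a
same-sized tensor of ones, so it can reuse the BLAS Dot routine on both CPU and
GPU instead of the buggy reduction kernel. A minimal sketch of the idea, with
numpy standing in for the SINGA tensor/BLAS calls (illustration only, not the
SINGA API):

    import numpy as np

    x = np.random.rand(1000).astype(np.float32)
    ones = np.ones_like(x)        # same-sized tensor filled with 1.0f, as tensor.cc does below
    s = float(np.dot(x, ones))    # dot(x, ones) equals the sum of all elements of x
    assert np.isclose(s, x.sum())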
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5b332a40
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5b332a40
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5b332a40

Branch: refs/heads/dev
Commit: 5b332a4086ff32b0c3a298169c0befef78f003ca
Parents: e1a524d
Author: Wei Wang <[email protected]>
Authored: Sun Aug 14 17:07:22 2016 +0800
Committer: Wei Wang <[email protected]>
Committed: Sun Aug 14 17:07:22 2016 +0800

----------------------------------------------------------------------
 examples/mnist/README.md           |  21 ++-
 examples/mnist/train.py            | 265 ++++++++++++++++----------------
 include/singa/model/loss.h         |   1 -
 src/core/tensor/math_kernel.cu     |   5 +
 src/core/tensor/math_kernel.h      |   2 +-
 src/core/tensor/tensor.cc          |  10 +-
 src/core/tensor/tensor_math_cuda.h |   5 +-
 src/python/singa/optimizer.py      |   1 +
 8 files changed, 169 insertions(+), 141 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/examples/mnist/README.md
----------------------------------------------------------------------
diff --git a/examples/mnist/README.md b/examples/mnist/README.md
index bfd480f..9f59e7e 100644
--- a/examples/mnist/README.md
+++ b/examples/mnist/README.md
@@ -1,3 +1,18 @@
-This example is to train an RBM model using mnist data set. This RBM follows paper http://www.cs.toronto.edu/~hinton/science.pdf and the source code for this paper can be found http://www.cs.toronto.edu/~hinton/MatlabForSciencePaper.html
-1. Download dataset mnist.pkl.gz from https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz
-2. $ python train.py
+# Train a RBM model against MNIST dataset
+
+This example is to train an RBM model using the
+MNIST dataset. The RBM model and its hyper-parameters are set following
+[Hinton's paper](http://www.cs.toronto.edu/~hinton/science.pdf)
+
+## Running instructions
+
+1. Download the pre-processed [MNIST dataset](https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz)
+
+2. Start the training
+
+        python train.py
+
+By default the training code would run on CPU. To run it on a GPU card, please start
+the program with an additional argument
+
+        python train.py --use_gpu
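The new examples/mnist/train.py below performs one step of contrastive
divergence (CD-1) per mini-batch, following Hinton's paper referenced in the
README. As a rough sketch of that update, with plain numpy arrays standing in
for SINGA tensors (illustration only; the function name is made up):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def cd1_step(v, W, vbias, hbias, lr):
        """One CD-1 update on a mini-batch v of shape (batch_size, 784)."""
        # positive phase: hidden probabilities and a binary sample
        hpos = sigmoid(v.dot(W) + hbias)
        hsample = (hpos > np.random.uniform(size=hpos.shape)).astype(np.float32)
        # negative phase: reconstruct the visible units, then the hidden units
        vneg = sigmoid(hsample.dot(W.T) + vbias)
        hneg = sigmoid(vneg.dot(W) + hbias)
        # gradients = negative statistics - positive statistics, as in train.py
        gW = vneg.T.dot(hneg) - v.T.dot(hpos)
        gvb = vneg.sum(axis=0) - v.sum(axis=0)
        ghb = hneg.sum(axis=0) - hpos.sum(axis=0)
        # plain SGD step scaled by the batch size (momentum/weight decay omitted)
        n = v.shape[0]
        W -= (lr / n) * gW
        vbias -= (lr / n) * gvb
        hbias -= (lr / n) * ghb
        return np.square(v - vneg).sum()   # reconstruction error reported per epoch

train.py does the same on singa.tensor objects, adds momentum and weight decay
through optimizer.SGD, and can place all tensors on a CUDA device.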

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/examples/mnist/train.py
----------------------------------------------------------------------
diff --git a/examples/mnist/train.py b/examples/mnist/train.py
index 52b023a..43b8e26 100644
--- a/examples/mnist/train.py
+++ b/examples/mnist/train.py
@@ -1,131 +1,134 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-import cPickle
-import numpy as np
-import numpy.matlib
-import os
-import sys
-import gzip, numpy
-
-
-sys.path.append(os.path.join(os.path.dirname(__file__),
-                             '../../build/python'))
-sys.path.append(os.path.join(os.path.dirname(__file__),
-                             '../../build/lib'))
-sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/src'))
-from singa import initializer
-from singa import utils
-from singa import optimizer
-from singa import device
-from singa import tensor
-from singa.proto import core_pb2
-
-
-
-def load_train_data(dir_path):
-    f = gzip.open(dir_path, 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    traindata = train_set[0].astype(np.float32)
-    validdata = valid_set[0].astype(np.float32)
-    return traindata, validdata
-
-
-
-def train(data_dir, num_epoch=10, batch_size=100):
-    print 'Start intialization............'
-    lr = 0.1   # Learning rate
-    weight_decay = 0.0002
-    hdim = 1000
-    vdim = 784
-    opt = optimizer.SGD(momentum=0.8, weight_decay=weight_decay)
-
-    shape = (vdim, hdim)
-    tweight = tensor.Tensor(shape)
-    initializer.gaussian(tweight, 0.0, 0.1)
-    tvbias = tensor.from_numpy(np.zeros(vdim, dtype = np.float32))
-    thbias = tensor.from_numpy(np.zeros(hdim, dtype = np.float32))
-    opt = optimizer.SGD(momentum=0.5, weight_decay=weight_decay)
-
-    print 'Loading data ..................'
-    train_x, valid_x = load_train_data(data_dir)
-
-    num_train_batch = train_x.shape[0]/batch_size
-    print "num_train_batch = \n", num_train_batch
-    for epoch in range(num_epoch):
-        trainerrorsum = 0.0
-        validerrorsum = 0.0
-        print 'Epoch %d' % epoch
-        for b in range(num_train_batch):
-            # positive phase
-            if b % 100 == 0:
-                print "batch: \n", b
-
-            tdata = tensor.from_numpy(train_x[ (b * batch_size): ((b + 1) * batch_size), : ])
-            tposhidprob = tensor.mult(tdata, tweight)
-            tposhidprob.add_row(thbias)
-            tposhidprob = tensor.sigmoid(tposhidprob)
-            tposhidrandom = tensor.Tensor(tposhidprob.shape)
-            initializer.uniform(tposhidrandom, 0.0, 1.0)
-            tposhidsample = tensor.gt(tposhidprob, tposhidrandom)
-
-            # negative phase
-            tnegdata = tensor.mult(tposhidsample, tweight.transpose())
-            tnegdata.add_row(tvbias)
-            tnegdata = tensor.sigmoid(tnegdata)
-
-            tneghidprob = tensor.mult(tnegdata, tweight)
-            tneghidprob.add_row(thbias)
-            tneghidprob = tensor.sigmoid(tneghidprob)
-            trainerror = tensor.sum(tensor.eltwise_mult((tdata - tnegdata),(tdata - tnegdata)))
-            trainerrorsum = trainerror + trainerrorsum
-
-            tgweight = tensor.mult(tnegdata.transpose(), tneghidprob) - tensor.mult(tdata.transpose(), tposhidprob)
-            tgvbias = tensor.sum(tnegdata, 0) - tensor.sum(tdata, 0)
-            tghbias = tensor.sum(tneghidprob, 0) - tensor.sum(tposhidprob, 0)
-
-            opt.apply_with_lr(epoch, lr / batch_size, tgweight, tweight, '')
-            opt.apply_with_lr(epoch, lr / batch_size, tgvbias, tvbias, '')
-            opt.apply_with_lr(epoch, lr / batch_size, tghbias, thbias, '')
-
-        info = 'train errorsum = %f' \
-            % (trainerrorsum)
-        print info
-
-        tvaliddata = tensor.from_numpy(valid_x[ :, : ])
-        tvalidposhidprob = tensor.mult(tvaliddata, tweight)
-        tvalidposhidprob.add_row(thbias)
-        tvalidposhidprob = tensor.sigmoid(tvalidposhidprob)
-        tvalidposhidrandom = tensor.Tensor(tvalidposhidprob.shape)
-        initializer.uniform(tvalidposhidrandom, 0.0, 1.0)
-        tvalidposhidsample = tensor.gt(tvalidposhidprob, tvalidposhidrandom)
-
-        tvalidnegdata = tensor.mult(tvalidposhidsample, tweight.transpose())
-        tvalidnegdata.add_row(tvbias)
-        tvalidnegdata = tensor.sigmoid(tvalidnegdata)
-
-        validerrorsum = tensor.sum(tensor.eltwise_mult((tvaliddata - tvalidnegdata),(tvaliddata - tvalidnegdata)))
-        validinfo = 'valid errorsum = %f' \
-            % (validerrorsum)
-        print validinfo
-
-
-if __name__ == '__main__':
-    data_dir = 'mnist.pkl.gz'
-    assert os.path.exists(data_dir), \
-        'Pls download the mnist dataset'
-    train(data_dir)
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import numpy as np
+import os
+import gzip
+import argparse
+import cPickle
+from singa import initializer
+from singa import utils
+from singa import optimizer
+from singa import device
+from singa import tensor
+
+
+from singa.proto import core_pb2
+
+
+
+def load_train_data(file_path):
+    f = gzip.open(file_path, 'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    traindata = train_set[0].astype(np.float32)
+    validdata = valid_set[0].astype(np.float32)
+    print traindata.shape, validdata.shape
+    return traindata, validdata
+
+
+
+def train(data_file, use_gpu, num_epoch=10, batch_size=100):
+    print 'Start intialization............'
+    lr = 0.1   # Learning rate
+    weight_decay = 0.0002
+    hdim = 1000
+    vdim = 784
+    opt = optimizer.SGD(momentum=0.8, weight_decay=weight_decay)
+
+    tweight = tensor.Tensor((vdim, hdim))
+    tweight.gaussian(0.0, 0.1)
+    tvbias = tensor.from_numpy(np.zeros(vdim, dtype = np.float32))
+    thbias = tensor.from_numpy(np.zeros(hdim, dtype = np.float32))
+    opt = optimizer.SGD(momentum=0.5, weight_decay=weight_decay)
+
+    print 'Loading data ..................'
+    train_x, valid_x = load_train_data(data_file)
+
+    if use_gpu:
+        dev = device.create_cuda_gpu()
+    else:
+        dev = device.get_default_device()
+
+    for t in [tweight, tvbias, thbias]:
+        t.to_device(dev)
+
+    num_train_batch = train_x.shape[0] / batch_size
+    print "num_train_batch = %d " % (num_train_batch)
+    for epoch in range(num_epoch):
+        trainerrorsum = 0.0
+        validerrorsum = 0.0
+        print 'Epoch %d' % epoch
+        for b in range(num_train_batch):
+            # positive phase
+            tdata = tensor.from_numpy(
+                    train_x[(b * batch_size):((b + 1) * batch_size), : ])
+            tdata.to_device(dev)
+            tposhidprob = tensor.mult(tdata, tweight)
+            tposhidprob.add_row(thbias)
+            tposhidprob = tensor.sigmoid(tposhidprob)
+            tposhidrandom = tensor.Tensor(tposhidprob.shape, dev)
+            tposhidrandom.uniform(0.0, 1.0)
+            tposhidsample = tensor.gt(tposhidprob, tposhidrandom)
+
+            # negative phase
+            tnegdata = tensor.mult(tposhidsample, tweight.transpose())
+            tnegdata.add_row(tvbias)
+            tnegdata = tensor.sigmoid(tnegdata)
+
+            tneghidprob = tensor.mult(tnegdata, tweight)
+            tneghidprob.add_row(thbias)
+            tneghidprob = tensor.sigmoid(tneghidprob)
+            error = tensor.sum(tensor.square((tdata - tnegdata)))
+            trainerrorsum = error + trainerrorsum
+
+            tgweight = tensor.mult(tnegdata.transpose(), tneghidprob) -\
+                tensor.mult(tdata.transpose(), tposhidprob)
+            tgvbias = tensor.sum(tnegdata, 0) - tensor.sum(tdata, 0)
+            tghbias = tensor.sum(tneghidprob, 0) - tensor.sum(tposhidprob, 0)
+
+            opt.apply_with_lr(epoch, lr / batch_size, tgweight, tweight, 'w')
+            opt.apply_with_lr(epoch, lr / batch_size, tgvbias, tvbias, 'vb')
+            opt.apply_with_lr(epoch, lr / batch_size, tghbias, thbias, 'hb')
+
+        print 'training errorsum = %f' % (trainerrorsum)
+
+        tvaliddata = tensor.from_numpy(valid_x)
+        tvaliddata.to_device(dev)
+        tvalidposhidprob = tensor.mult(tvaliddata, tweight)
+        tvalidposhidprob.add_row(thbias)
+        tvalidposhidprob = tensor.sigmoid(tvalidposhidprob)
+        tvalidposhidrandom = tensor.Tensor(tvalidposhidprob.shape, dev)
+        initializer.uniform(tvalidposhidrandom, 0.0, 1.0)
+        tvalidposhidsample = tensor.gt(tvalidposhidprob, tvalidposhidrandom)
+
+        tvalidnegdata = tensor.mult(tvalidposhidsample, tweight.transpose())
+        tvalidnegdata.add_row(tvbias)
+        tvalidnegdata = tensor.sigmoid(tvalidnegdata)
+
+        validerrorsum = tensor.sum(tensor.square((tvaliddata - tvalidnegdata)))
+        print 'valid errorsum = %f' % (validerrorsum)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Train RBM over MNIST')
+    parser.add_argument('file', type=str, help='the dataset path')
+    parser.add_argument('--use_gpu', action='store_true')
+    args = parser.parse_args()
+
+    assert os.path.exists(args.file), 'Pls download the MNIST dataset from' \
+        'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'
+    train(args.file, args.use_gpu)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 951c477..4ee41cb 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -51,7 +51,6 @@ public:
   /// [Evaluate|Forward] Backward.
   float Evaluate(int flag, const Tensor &prediction, const Tensor &target) {
     Tensor loss = Forward(flag, prediction, target);
-    loss.ToHost();
     return Sum<float>(loss) / (1.0f * loss.Size());
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index e0112f3..d3f3335 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -35,6 +35,8 @@ namespace singa {
 
 // Cuda Kernel Functions
 namespace cuda {
+/*
+wangwei: Not used due to error in the code.
 __global__ void KernelSum(const size_t n, const float *in, float *out) {
   int THREADS = blockDim.x;
 
@@ -65,6 +67,7 @@ __global__ void KernelSum(const size_t n, const float *in, float *out) {
   __syncthreads();
   *out = aux[0];
 }
+*/
 
 __global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
                           float *out) {
@@ -461,12 +464,14 @@ void div(const size_t n, const float *in1, const float *in2, float *out,
   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
+/*
 void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
   //  here, we only need one block
   int num_blocks = 1;
   KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
 }
+*/
 
 void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p,
                          const int *t, float *loss, cudaStream_t stream) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 202777e..cb0cb6a 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -101,7 +101,7 @@ void mult(const size_t n, const float *in1, const float *in2, float *out,
 void div(const size_t n, const float *in1, const float *in2, float *out,
          cudaStream_t s);
 
-void sum(const size_t n, const float *in, float *out, cudaStream_t s);
+// void sum(const size_t n, const float *in, float *out, cudaStream_t s);
 
 void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
                          const float *p, const int *t, float *loss,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index b80e233..670b27e 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -626,10 +626,14 @@ Tensor Average(const Tensor &M, int axis) {
 
 template <> float Sum<float>(const Tensor &in) {
   float s = 0.0f;
+  Tensor one(in.shape(), in.device(), in.data_type());
+  one.SetValue(1.0f);
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    in.device()->Exec([in, &s](Context *ctx) {
-      Sum<DType, Lang>(in.Size(), in.block(), &s, ctx);
-    }, {in.block()}, {});
+    one.device()->Exec([in, one, &s](Context *ctx) {
+      DType ret = DType(0);
+      Dot<DType, Lang>(in.Size(), in.block(), one.block(), &ret, ctx);
+      s = ret;
+    }, {in.block(), one.block()}, {});
   });
   return s;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 1cd61b3..4daa97a 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -263,8 +263,9 @@ void Sub<float, lang::Cuda>(const size_t num, const Block* in1,
 template <>
 void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  cuda::sum(num, inPtr, out, ctx->stream);
+  LOG(FATAL) << "Cuda Sum is not implemented!";
+  // const float* inPtr = static_cast<const float*>(in->data());
+  // cuda::sum(num, inPtr, out, ctx->stream);
 }
 
 /// Element-wise operation, out[i]=tanh([in[i])

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5b332a40/src/python/singa/optimizer.py
----------------------------------------------------------------------
diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py
index 7cab746..aa6bdd1 100644
--- a/src/python/singa/optimizer.py
+++ b/src/python/singa/optimizer.py
@@ -187,6 +187,7 @@ class SGD(Optimizer):
         """
         super(SGD, self).__init__(lr, momentum, decay)
         conf = model_pb2.OptimizerConf()
+        conf.momentum = momentum
         self.opt = singa.CreateOptimizer('SGD')
         self.opt.Setup(conf.SerializeToString())
 
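The one-line optimizer.py change is what makes the momentum settings above take
effect: OptimizerConf was previously serialized without the momentum field, so
the value passed to the Python wrapper never reached the C++ SGD. A hedged
usage sketch mirroring train.py (shapes and values are placeholders):

    from singa import optimizer, tensor

    opt = optimizer.SGD(momentum=0.5, weight_decay=0.0002)  # momentum is now forwarded
    value = tensor.Tensor((784, 1000))   # placeholder parameter
    grad = tensor.Tensor((784, 1000))    # placeholder gradient
    value.gaussian(0.0, 0.1)
    grad.gaussian(0.0, 0.01)
    # one SGD step, as train.py does for the RBM weight matrix
    opt.apply_with_lr(0, 0.1 / 100, grad, value, 'w')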
