SINGA-98 Add Support for AlexNet ImageNet Classification Model

Update the CudnnActivationLayer to share the data and grad blobs with the conv layer to reduce memory usage. This is controlled by the share_src_blobs field in the job config file. The loss decreases after 3000 iterations with a mini-batch size of 256, similar to Caffe.
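For illustration, a minimal sketch of how the new field could be set in the job config so that an activation layer reuses the blobs of its source conv layer (the layer names and the kCudnnActivation type below are assumptions for the example; the commit itself only adds the share_src_blobs field, default false, to LayerProto):

    layer {
      name: "relu1"
      type: kCudnnActivation
      srclayers: "conv1"
      # reuse conv1's data and grad blobs instead of allocating new ones
      share_src_blobs: true
    }

With the default (false) every layer keeps its own data and grad blobs; setting it to true drops the extra buffers that the activation layer would otherwise allocate.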
cpplint check; updte job conf for cpu training; Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/bb75a0be Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/bb75a0be Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/bb75a0be Branch: refs/heads/master Commit: bb75a0be5f1bf00d24552fb943b5fc40453b5855 Parents: 6e815db Author: Wei Wang <[email protected]> Authored: Tue Dec 29 19:10:00 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Wed Jan 6 00:58:18 2016 +0800 ---------------------------------------------------------------------- examples/alexnet/cudnn.conf | 18 ++++++++++-- examples/alexnet/im2rec.cc | 35 +++++++++++++++++++----- examples/alexnet/job.conf | 50 ++++++++++++++++++---------------- examples/alexnet/rec2im_test.cc | 39 ++++++++++++++++++++------ src/neuralnet/layer.cc | 2 +- src/neuralnet/neuron_layer/lrn.cc | 5 ++-- src/proto/job.proto | 3 ++ src/utils/updater.cc | 2 +- 8 files changed, 108 insertions(+), 46 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/examples/alexnet/cudnn.conf ---------------------------------------------------------------------- diff --git a/examples/alexnet/cudnn.conf b/examples/alexnet/cudnn.conf index eef20f9..6324185 100644 --- a/examples/alexnet/cudnn.conf +++ b/examples/alexnet/cudnn.conf @@ -1,7 +1,7 @@ name: "alexnet" train_steps: 450000 -#test_steps: 500 -#test_freq: 1000 +test_steps: 500 +test_freq: 1000 disp_freq: 20 checkpoint_freq: 100000 checkpoint_after: 100000 @@ -423,6 +423,20 @@ neuralnet { } srclayers: "ip8" srclayers: "data" + include: kTrain + } + layer { + name : "softmax" + type: kCudnnSoftmax + srclayers: "ip8" + include: kTest + } + layer { + name : "accuracy" + type: kAccuracy + srclayers: "softmax" + srclayers: "data" + include: kTest } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/examples/alexnet/im2rec.cc ---------------------------------------------------------------------- diff --git a/examples/alexnet/im2rec.cc b/examples/alexnet/im2rec.cc index cf6eedf..58ee44f 100644 --- a/examples/alexnet/im2rec.cc +++ b/examples/alexnet/im2rec.cc @@ -1,4 +1,27 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + + #include <glog/logging.h> +#include <opencv2/opencv.hpp> #include <algorithm> #include <random> #include <chrono> @@ -7,7 +30,6 @@ #include <cstdint> #include <iostream> #include <vector> -#include <opencv2/opencv.hpp> #include "singa/io/store.h" #include "singa/proto/common.pb.h" @@ -20,8 +42,7 @@ const int kImageNBytes = 256*256*3; void create_data(const string& image_list, const string& input_folder, const string& output_folder, - const string& backend = "kvfile") -{ + const string& backend = "kvfile") { singa::RecordProto image; image.add_shape(3); image.add_shape(kImageSize); @@ -49,12 +70,12 @@ void create_data(const string& image_list, string rec_buf; cv::Mat img, res; std::vector<std::pair<string, int>> file_list; - while(image_list_file >> image_file_name >> label) + while (image_list_file >> image_file_name >> label) file_list.push_back(std::make_pair(image_file_name, label)); LOG(INFO) << "Data Shuffling"; unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::shuffle(file_list.begin(), file_list.end() - ,std::default_random_engine()); + , std::default_random_engine()); LOG(INFO) << "Total number of images is " << file_list.size(); int ImageNum = file_list.size(); @@ -120,8 +141,8 @@ void create_data(const string& image_list, int main(int argc, char** argv) { if (argc < 4) { - std::cout << "Create Datashard for ImageNet dataset.\n" - << "Usage: <image_list> <input_folder> <output_folder>" + std::cout << "Create data stores for ImageNet dataset.\n" + << "Usage: <image_list_file> <input_image_folder> <output_folder>" << " <Optional: backend {lmdb, kvfile} default: kvfile>\n"; } else { google::InitGoogleLogging(argv[0]); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/examples/alexnet/job.conf ---------------------------------------------------------------------- diff --git a/examples/alexnet/job.conf b/examples/alexnet/job.conf index 1898a09..3b7eaf4 100644 --- a/examples/alexnet/job.conf +++ b/examples/alexnet/job.conf @@ -1,10 +1,10 @@ name: "alexnet" -train_steps: 100 -test_steps: 0 -test_freq: 300 -disp_freq: 5 -#debug: true -#checkpoint_path: "examples/alexnet/checkpoint/step10000-worker0" +train_steps: 450000 +test_steps: 500 +test_freq: 1000 +disp_freq: 20 +checkpoint_freq: 100000 +checkpoint_after: 100000 train_one_batch { alg: kBP } @@ -13,8 +13,12 @@ updater{ weight_decay: 0.0005 momentum: 0.9 learning_rate { - type: kFixed + type: kStep base_lr: 0.01 + step_conf { + gamma: 0.1 + change_freq: 100000 + } } } neuralnet { @@ -25,22 +29,22 @@ neuralnet { backend: "kvfile" path :"/data/dataset/imagenet/train_record.bin" mean_file: "/data/dataset/imagenet/image_mean.bin" - batchsize: 32 - #random_skip: 5000 + batchsize: 256 + #random_skip: 1000 shape: 3 shape: 256 shape: 256 } - include: kTrain + include: kTrain } layer{ name: "data" type: kRecordInput store_conf { backend: "kvfile" - path :"/data/dataset/val_record.bin" - mean_file: "/data/dataset/image_mean.bin" - batchsize: 256 + path :"/data/dataset/imagenet/val_record.bin" + mean_file: "/data/dataset/imagenet/image_mean.bin" + batchsize: 100 shape: 3 shape: 256 shape: 256 @@ -59,7 +63,7 @@ neuralnet { } layer{ name: "conv1" - type: kCConvolution + type: kConvolution srclayers: "image" convolution_conf { num_filters: 96 @@ -92,7 +96,7 @@ neuralnet { } layer { name: "pool1" - type: kCPooling + type: kPooling pooling_conf { pool: MAX kernel: 3 @@ -108,7 +112,7 @@ neuralnet { local_size: 5 alpha: 0.0001 beta: 
0.75 - knorm: 2 + knorm: 1 } srclayers: "pool1" # partition_dim: 0 @@ -116,7 +120,7 @@ neuralnet { layer{ name: "conv2" - type: kCConvolution + type: kConvolution srclayers: "norm1" convolution_conf { num_filters: 256 @@ -149,7 +153,7 @@ neuralnet { } layer { name: "pool2" - type: kCPooling + type: kPooling pooling_conf { pool: MAX kernel: 3 @@ -166,14 +170,14 @@ neuralnet { local_size: 5 alpha: 0.0001 beta: 0.75 - knorm: 2 + knorm: 1 } srclayers: "pool2" # partition_dim: 0 } layer{ name: "conv3" - type: kCConvolution + type: kConvolution srclayers: "norm2" convolution_conf { num_filters: 384 @@ -206,7 +210,7 @@ neuralnet { } layer{ name: "conv4" - type: kCConvolution + type: kConvolution srclayers: "relu3" convolution_conf { num_filters: 384 @@ -239,7 +243,7 @@ neuralnet { } layer{ name: "conv5" - type: kCConvolution + type: kConvolution srclayers: "relu4" convolution_conf { num_filters: 256 @@ -272,7 +276,7 @@ neuralnet { } layer { name: "pool5" - type: kCPooling + type: kPooling pooling_conf { pool: MAX kernel: 3 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/examples/alexnet/rec2im_test.cc ---------------------------------------------------------------------- diff --git a/examples/alexnet/rec2im_test.cc b/examples/alexnet/rec2im_test.cc index 0fa3505..bb92d95 100644 --- a/examples/alexnet/rec2im_test.cc +++ b/examples/alexnet/rec2im_test.cc @@ -1,4 +1,28 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + + #include <glog/logging.h> +#include <opencv2/opencv.hpp> + #include <algorithm> #include <random> #include <chrono> @@ -7,7 +31,6 @@ #include <cstdint> #include <iostream> #include <vector> -#include <opencv2/opencv.hpp> #include "singa/io/store.h" #include "singa/proto/common.pb.h" @@ -19,8 +42,7 @@ const int kImageNBytes = 256*256*3; void generate_image(const string& output_folder, const string& key, - const string& val) -{ + const string& val) { float image_buf[kImageNBytes]; singa::RecordProto image; image.ParseFromString(val); @@ -38,6 +60,7 @@ void generate_image(const string& output_folder, static_cast<uchar>( static_cast<uint8_t>( pixel[(c * kImageSize + h) * kImageSize + w])); + } } cv::imwrite(image_name, img); @@ -45,8 +68,7 @@ void generate_image(const string& output_folder, void visualize(const string& input_file, const string& output_folder, - const string& id_list) -{ + const string& id_list) { auto store = singa::io::OpenStore("kvfile", input_file, singa::io::kRead); @@ -55,7 +77,7 @@ void visualize(const string& input_file, std::ifstream id_list_file(id_list.c_str(), std::ios::in); CHECK(id_list_file.is_open()) << "Unable to open image id list"; string id_; - while(id_list_file >> id_) { + while (id_list_file >> id_) { int x; x = std::stoi(id_); image_id_list.push_back(x); @@ -80,10 +102,9 @@ void visualize(const string& input_file, } } -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { if (argc != 4) { - std::cout << "Visualize images from binary kvfile record.\n" + std::cout << "Visualize images from binary kvfile records.\n" << "Usage: <input_file> <output_folder> <id_list>\n"; } else { google::InitGoogleLogging(argv[0]); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/src/neuralnet/layer.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc index ac673dd..cb1f3b8 100644 --- a/src/neuralnet/layer.cc +++ b/src/neuralnet/layer.cc @@ -56,7 +56,7 @@ const std::string Layer::ToString(bool debug, int flag) { if ((flag & kBackward) == kBackward && grad_.count() != 0) { ret += StringPrintf("grad:%e ", Asum(grad_)); for (Param* p : GetParams()) - ret += StringPrintf("%13.9f ", + ret += StringPrintf("%s:%13.9f ", p->name().c_str(), Asum(p->grad())); } return ret; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/src/neuralnet/neuron_layer/lrn.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/lrn.cc b/src/neuralnet/neuron_layer/lrn.cc index ce96d11..b199b9a 100644 --- a/src/neuralnet/neuron_layer/lrn.cc +++ b/src/neuralnet/neuron_layer/lrn.cc @@ -64,12 +64,11 @@ void LRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { auto grad = Tensor4(&grad_); auto gsrc = Tensor4(srclayers[0]->mutable_grad(this)); - gsrc = grad * expr::F<op::power>(norm, -beta_ ); + gsrc = grad * expr::F<op::power>(norm, -beta_); Tensor<cpu, 4> tmp(gsrc.shape); AllocSpace(tmp); tmp = gsrc * src / norm; - gsrc += ( - 2.0f * beta_ * salpha ) * expr::chpool<red::sum>(tmp, lsize_ ) - * src; + gsrc += (- 2.0f * beta_ * salpha) * expr::chpool<red::sum>(tmp, lsize_) * src; FreeSpace(tmp); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/src/proto/job.proto ---------------------------------------------------------------------- diff --git a/src/proto/job.proto b/src/proto/job.proto index 03ee327..db55987 
100644 --- a/src/proto/job.proto +++ b/src/proto/job.proto @@ -192,6 +192,9 @@ message LayerProto { optional LayerType type = 20 [default = kUserLayer]; // type of user layer optional string user_type = 21; + // share data and grad blob with the single src layer, e.g., relu layer can + // share blobs from conv layer. It is useful for saving memory space. + optional bool share_src_blobs = 22 [default = false]; // overrides the partition dimension for neural net optional int32 partition_dim = 60 [default = -1]; // names of parameters shared from other layers http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bb75a0be/src/utils/updater.cc ---------------------------------------------------------------------- diff --git a/src/utils/updater.cc b/src/utils/updater.cc index bb055c2..21608fa 100644 --- a/src/utils/updater.cc +++ b/src/utils/updater.cc @@ -60,7 +60,7 @@ float StepLRGen::Get(int step) { // do not cast int to float int freq = proto_.step_conf().change_freq(); float lr = proto_.base_lr() * pow(proto_.step_conf().gamma(), step / freq); - LOG_IF(ERROR, step % freq == 0) << "Update learning rate to " << lr + LOG_IF(INFO, step % freq == 0) << "Update learning rate to " << lr << " @ step " << step; return lr; }
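To make the kStep schedule above concrete: with the values in the updated job.conf (base_lr 0.01, gamma 0.1, change_freq 100000) and the integer division in StepLRGen::Get (lr = base_lr * gamma^(step / freq)), the learning rate over the 450000 training steps is

    step      0 ..  99999 : lr = 0.01 * 0.1^0 = 0.01
    step 100000 .. 199999 : lr = 0.01 * 0.1^1 = 0.001
    step 200000 .. 299999 : lr = 0.01 * 0.1^2 = 0.0001
    step 300000 .. 399999 : lr = 0.01 * 0.1^3 = 0.00001
    step 400000 .. 449999 : lr = 0.01 * 0.1^4 = 0.000001

i.e. a 10x decay every 100000 steps. The LOG_IF level change from ERROR to INFO makes the "Update learning rate" message a routine progress line rather than an error.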

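For completeness, a hypothetical invocation of the im2rec tool following the usage string in the patch (the binary name and all paths are placeholders; only the argument order, the image_list/input_folder/output_folder parameters, and the optional {lmdb, kvfile} backend with kvfile as the default come from the patch):

    ./im2rec train_list.txt /path/to/raw/train/images /path/to/output/folder kvfile

Passing "lmdb" as the fourth argument selects the lmdb backend instead; rec2im_test can then be pointed at the generated kvfile records with an id list to visualize a few images and sanity-check the conversion.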