SINGA-300 - Add residual networks for imagenet classification

update convert.py for general resnet models and pre-activation resnet
tested all models for serving; upload pickle files to s3

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/45ec92d8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/45ec92d8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/45ec92d8

Branch: refs/heads/master
Commit: 45ec92d8ffc1fa1385a9307fdf07e21da939ee2f
Parents: c0317d1
Author: Wei Wang <[email protected]>
Authored: Sat Feb 11 18:39:20 2017 +0800
Committer: Wei Wang <[email protected]>
Committed: Sun Feb 12 22:01:19 2017 +0800

----------------------------------------------------------------------
 examples/imagenet/resnet/convert.py | 136 +++++++++++++-------
 examples/imagenet/resnet/model.py   | 214 +++++++++++++++++++++++++------
 examples/imagenet/resnet/serve.py   |   5 +-
 3 files changed, 271 insertions(+), 84 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/45ec92d8/examples/imagenet/resnet/convert.py
----------------------------------------------------------------------
diff --git a/examples/imagenet/resnet/convert.py b/examples/imagenet/resnet/convert.py
index c976bf5..e630281 100644
--- a/examples/imagenet/resnet/convert.py
+++ b/examples/imagenet/resnet/convert.py
@@ -1,48 +1,98 @@
+import os
 import torchfile
 import numpy as np
 import cPickle as pickle
+from argparse import ArgumentParser
 
-def conv(m, name, params):
+'''Extract the net parameters from the torch file and store them as python dict
+using cPickle'''
+
+import model
+
+verbose=False
+
+def add_param(idx, name, val, params):
+    if type(params) == dict:
+        assert name not in params, 'duplicated param %s' % name
+        params[name] = val
+    else:
+        assert params[idx].size() == val.size, 'size mismatch for %s: %s - %s' % (name, (params[idx].shape,), (val.shape,))
+        params[idx].copy_from_numpy(val)
+
+    if verbose:
+        print name, val.shape
+
+
+def conv(m, idx, params, param_names):
     outplane = m['weight'].shape[0]
-    params[name + '-conv_weight'] = np.reshape(m['weight'], (outplane, -1))
-    return params
-
-def batchnorm(m, name, params):
-    params[name + '-bn_gamma'] = m['weight']
-    params[name + '-bn_beta'] = m['bias']
-    params[name + '-bn_mean'] = m['running_mean']
-    params[name + '-bn_var'] = m['running_var']
-    return params
-
-def block(m, name, params, has_identity):
-    branch=m[0].modules[0].modules
-    params = conv(branch[0], name + '-1', params)
-    params = batchnorm(branch[1], name + '-1', params)
-    params = conv(branch[3], name + '-2', params)
-    params = batchnorm(branch[4], name + '-2', params)
-    params = conv(branch[6], name + '-3', params)
-    params = batchnorm(branch[7], name + '-3', params)
-    if not has_identity:
-        shortcut = m[0].modules[1].modules
-        params = conv(shortcut[0], name + '-shortcut', params)
-        params = batchnorm(shortcut[1], name + '-shortcut', params)
-    return params
-
-def stage(sid, m, num_blk, params):
-    for i in range(num_blk):
-        params = block(m[i].modules, 'stage%d-blk%d' % (sid, i), params, i!=0)
-    return params
-
-params = {}
-model = torchfile.load('wrn-50-2.t7').modules
-params = conv(model[0], 'input', params)
-params = batchnorm(model[1], 'input', params)
-params = stage(0, model[4].modules, 3, params)
-params = stage(1, model[5].modules, 4, params)
-params = stage(2, model[6].modules, 6, params)
-params = stage(3, model[7].modules, 3, params)
-
-params['dense_weight'] = np.transpose(model[10]['weight'])
-params['dense_bias'] = model[10]['bias']
-with open('wrn-50-2.pickle', 'wb') as fd:
-    pickle.dump(params, fd)
+    name = param_names[idx]
+    val = np.reshape(m['weight'], (outplane, -1))
+    add_param(idx, name, val, params)
+    return idx + 1
+
+
+def batchnorm(m, idx, params, param_names):
+    add_param(idx, param_names[idx], m['weight'], params)
+    add_param(idx + 1, param_names[idx + 1], m['bias'], params)
+    add_param(idx + 2, param_names[idx + 2], m['running_mean'], params)
+    add_param(idx + 3, param_names[idx + 3], m['running_var'], params)
+    return idx + 4
+
+
+def linear(m, idx, params, param_names):
+    add_param(idx, param_names[idx], np.transpose(m['weight']), params)
+    add_param(idx + 1, param_names[idx + 1], m['bias'], params)
+    return idx + 2
+
+
+def traverse(m, idx, params, param_names):
+    ''' Traverse all modules of the torch checkpoint file to extract params.
+
+    Args:
+        m, a TorchObject
+        idx, index for the current cursor of param_names
+        params, an empty dictionary (name->numpy) to dump the params via pickle;
+            or a list of tensor objects which should be in the same order as
+            param_names, called to initialize the net created in Singa directly
+            using param values from the torch checkpoint file.
+
+    Returns:
+        the updated idx
+    '''
+    module_type = m.__dict__['_typename']
+    if module_type in ['nn.Sequential', 'nn.ConcatTable']:
+        for x in m.modules:
+            idx = traverse(x, idx, params, param_names)
+    elif 'SpatialConvolution' in module_type:
+        idx = conv(m, idx, params, param_names)
+    elif 'SpatialBatchNormalization' in module_type:
+        idx = batchnorm(m, idx, params, param_names)
+    elif 'Linear' in module_type:
+        idx = linear(m, idx, params, param_names)
+    return idx
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description='Convert params from torch to python '
+        'dict. \n resnet could have depth of 18, 34, 101, 152; \n '
+        'wrn has depth 50; preact has depth 200; addbn has depth 50')
+    parser.add_argument("infile", help="torch checkpoint file")
+    parser.add_argument("model", choices=['resnet', 'wrn', 'preact', 'addbn'])
+    parser.add_argument("depth", type=int, choices=[18, 34, 50, 101, 152, 200])
+    args = parser.parse_args()
+
+    net = model.create_net(args.model, args.depth)
+    # model.init_params(net)
+    m = torchfile.load(args.infile)
+    params = {}
+    # params = net.param_values()
+    param_names = net.param_names()
+    traverse(m, 0, params, param_names)
+    miss = [name for name in param_names if name not in params]
+    if len(miss) > 0:
+        print 'The following params are missing from torch file'
+        print miss
+
+    outfile = os.path.splitext(args.infile)[0] + '.pickle'
+    with open(outfile, 'wb') as fd:
+        pickle.dump(params, fd)


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/45ec92d8/examples/imagenet/resnet/model.py
----------------------------------------------------------------------
diff --git a/examples/imagenet/resnet/model.py b/examples/imagenet/resnet/model.py
index 7c9a3cf..34dfd9f 100644
--- a/examples/imagenet/resnet/model.py
+++ b/examples/imagenet/resnet/model.py
@@ -14,7 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-''' This model is created following https://github.com/facebook/fb.resnet.torch.git
+''' These models are created following https://github.com/facebook/fb.resnet.torch.git
+and https://github.com/szagoruyko/wide-residual-networks
 '''
 from singa.layer import Conv2D, Activation, MaxPooling2D, AvgPooling2D,\
     Split, Merge, Flatten, Dense, BatchNormalization, Softmax
@@ -26,6 +27,17 @@ ffnet.verbose=True
 conv_bias = False
 
 def conv(net, prefix, n, ksize, stride=1, pad=0, bn=True, relu=True, src=None):
+    '''Add a convolution layer and optionally a batchnorm and relu layer.
+
+    Args:
+        prefix, a string for the prefix of the layer name
+        n, num of filters for the conv layer
+        bn, if true add batchnorm
+        relu, if true add relu
+
+    Returns:
+        the last added layer
+    '''
     ret = net.add(Conv2D(
         prefix + '-conv', n, ksize, stride, pad=pad, use_bias=conv_bias), src)
     if bn:
@@ -35,40 +47,95 @@ def conv(net, prefix, n, ksize, stride=1, pad=0, bn=True, relu=True, src=None):
     return ret
 
-def shortcut(net, prefix, inplane, outplane, stride, src):
+def shortcut(net, prefix, inplane, outplane, stride, src, bn=False):
+    '''Add a conv shortcut layer if inplane != outplane; otherwise return the
+    source layer directly.
+
+    Args:
+        prefix, a string for the prefix of the layer name
+        bn, if true add a batchnorm layer after the conv layer
+
+    Returns:
+        the last added layer or the source layer.
+    '''
     if inplane == outplane:
         return src
-    return conv(net, prefix + '-shortcut', outplane, 1, stride, 0, True, False, src)
+    return conv(net, prefix + '-shortcut', outplane, 1, stride, 0, bn, False, src)
+
+
+def bottleneck(name, net, inplane, midplane, outplane, stride=1, preact=False, add_bn=False):
+    '''Add three conv layers, with a>=b<=c filters.
+
+    The default structure is
+        input
+            -split - conv1-bn1-relu1-conv2-bn2-relu2-conv3-bn3
+                   - conv-bn or dummy
+            -add
+            -relu
 
-def bottleneck(name, net, inplane, midplane, outplane, stride=1, preact=False):
+    Args:
+        inplane, num of feature maps of the input
+        midplane, num of feature maps of the middle layer
+        outplane, num of feature maps of the output
+        preact, if true, move the bn3 and relu before conv1, i.e., pre-activation as in the identity mapping paper
+        add_bn, if true, move the last bn after the addition layer (for resnet-50)
+    '''
+    assert not (preact and add_bn), 'preact and batchnorm after addition cannot be true at the same time'
     split = net.add(Split(name + '-split', 2))
-    conv(net, name + '-1', midplane, 1, 1, 0, True, True, src=split)
-    conv(net, name + '-2', midplane, 3, stride, 1, True, True)
-    br0 = conv(net, name + '-3', outplane, 1, 1, 0, True, False)
-    br1 = shortcut(net, name, inplane, outplane, stride, split)
-    net.add(Merge(name + '-add'), [br0, br1])
-    return net.add(Activation(name + '-relu'))
-
-def basicblock(name, net, inplane, midplane, outplane, stride=1, preact=False):
-    assert midplane==outplane, 'midplan and outplane should be the same'
+    if preact:
+        net.add(BatchNormalization(name + '-preact-bn'))
+        net.add(Activation(name + '-preact-relu'))
+    conv(net, name + '-0', midplane, 1, 1, 0, True, True)
+    conv(net, name + '-1', midplane, 3, stride, 1, True, True)
+    br0 = conv(net, name + '-2', outplane, 1, 1, 0, not (preact or add_bn), False)
+    br1 = shortcut(net, name, inplane, outplane, stride, split, not add_bn)
+    ret = net.add(Merge(name + '-add'), [br0, br1])
+    if add_bn:
+        ret = net.add(BatchNormalization(name + '-add-bn'))
+    if not preact:
+        ret = net.add(Activation(name + '-add-relu'))
+    return ret
+
+
+def basicblock(name, net, inplane, midplane, outplane, stride=1, preact=False, add_bn=False):
+    '''Add two conv layers, with a<=b filters.
+
+    The default structure is
+        input
+            -split - conv1-bn1-relu1-conv2-bn2
+                   - conv or dummy
+            -add
+            -relu
+
+    Args:
+        inplane, num of feature maps of the input
+        midplane, num of feature maps of the middle layer
+        outplane, num of feature maps of the output
+        preact, if true, move the bn2 and relu before conv1, i.e., pre-activation as in the identity mapping paper
+        add_bn, if true, move the last bn after the addition layer (for resnet-50)
+    '''
+    assert not (preact and add_bn), 'preact and batchnorm after addition cannot be true at the same time'
     split = net.add(Split(name + '-split', 2))
     if preact:
-        net.add(BatchNormalization(name + '-preact-bn'), split)
+        net.add(BatchNormalization(name + '-preact-bn'))
         net.add(Activation(name + '-preact-relu'))
-    conv(net, name + '-1', outplane, 3, stride, 1, True, True, split)
-    br0 = conv(net, name + '-2', outplane, 3, 1, 1, True, False)
-    br1 = shortcut(net, name, inplane, outplane, stride, split)
-    net.add(Merge(name + '-add'), [br0, br1])
-    return net.add(Activation(name + '-add-relu'))
+    conv(net, name + '-0', midplane, 3, stride, 1, True, True)
+    br0 = conv(net, name + '-1', outplane, 3, 1, 1, not preact, False)
+    br1 = shortcut(net, name, inplane, outplane, stride, split, False)
+    ret = net.add(Merge(name + '-add'), [br0, br1])
+    if add_bn:
+        ret = net.add(BatchNormalization(name + '-add-bn'))
+    if not preact:
+        ret = net.add(Activation(name + '-add-relu'))
+    return ret
 
-def stage(sid, net, num_blk, inplane, midplane, outplane, stride, block):
-    block('stage%d-blk%d' % (sid, 0), net, inplane, midplane, outplane, stride)
+def stage(sid, net, num_blk, inplane, midplane, outplane, stride, block, preact=False, add_bn=False):
+    block('stage%d-blk%d' % (sid, 0), net, inplane, midplane, outplane, stride, preact, add_bn)
     for i in range(1, num_blk):
-        block('stage%d-blk%d' % (sid, i), net, outplane, midplane, outplane)
+        block('stage%d-blk%d' % (sid, i), net, outplane, midplane, outplane, 1, preact, add_bn)
 
-def init_params(net, weight_path):
+def init_params(net, weight_path=None):
     if weight_path == None:
         for pname, pval in zip(net.param_names(), net.param_values()):
             print pname, pval.shape
@@ -89,32 +156,90 @@ def init_params(net, weight_path):
     else:
         net.load(weight_path, use_pickle = 'pickle' in weight_path)
 
-def create_resnet(weight_path=None, depth=50):
-    cfg = {
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck),
-    }
+
+cfg = { 18: [2, 2, 2, 2],  # basicblock
+        34: [3, 4, 6, 3],  # basicblock
+        50: [3, 4, 6, 3],  # bottleneck
+        101: [3, 4, 23, 3],  # bottleneck
+        152: [3, 8, 36, 3],  # bottleneck
+        200: [3, 24, 36, 3]}  # bottleneck
+
+
+def create_addbn_resnet(depth=50):
+    '''Original resnet with the last batchnorm of each block moved to after the addition layer'''
     net = ffnet.FeedForwardNet()
-    net.add(Conv2D('input-conv', 64, 7, 2, pad=3, input_sample_shape=(3, 224, 224)))
+    net.add(Conv2D('input-conv', 64, 7, 2, pad=3, use_bias=False, input_sample_shape=(3, 224, 224)))
     net.add(BatchNormalization('input-bn'))
     net.add(Activation('input_relu'))
     net.add(MaxPooling2D('input_pool', 3, 2, pad=1))
+    conf = cfg[depth]
+    if depth > 34:
+        stage(0, net, conf[0], 64, 64, 256, 1, bottleneck, add_bn=True)
+        stage(1, net, conf[1], 256, 128, 512, 2, bottleneck, add_bn=True)
+        stage(2, net, conf[2], 512, 256, 1024, 2, bottleneck, add_bn=True)
+        stage(3, net, conf[3], 1024, 512, 2048, 2, bottleneck, add_bn=True)
+    else:
+        stage(0, net, conf[0], 64, 64, 64, 1, basicblock, add_bn=True)
+        stage(1, net, conf[1], 64, 128, 128, 2, basicblock, add_bn=True)
+        stage(2, net, conf[2], 128, 256, 256, 2, basicblock, add_bn=True)
+        stage(3, net, conf[3], 256, 512, 512, 2, basicblock, add_bn=True)
+    net.add(AvgPooling2D('avg', 7, 1, pad=0))
+    net.add(Flatten('flat'))
+    net.add(Dense('dense', 1000))
+    return net
+
+
+def create_resnet(depth=18):
+    '''Original resnet, where there is a relu after the addition layer'''
+    net = ffnet.FeedForwardNet()
+    net.add(Conv2D('input-conv', 64, 7, 2, pad=3, use_bias=False, input_sample_shape=(3, 224, 224)))
+    net.add(BatchNormalization('input-bn'))
+    net.add(Activation('input_relu'))
+    net.add(MaxPooling2D('input_pool', 3, 2, pad=1))
     conf = cfg[depth]
-    stage(0, net, conf[0][0], 64, 64, 256, 1, conf[1])
-    stage(1, net, conf[0][1], 256, 128, 512, 2, conf[1])
-    stage(2, net, conf[0][2], 512, 256, 1024, 2, conf[1])
-    stage(3, net, conf[0][3], 1024, 512, 2048, 2, conf[1])
-    net.add(AvgPooling2D('avg', 7, 1))
+    if depth > 34:
+        stage(0, net, conf[0], 64, 64, 256, 1, bottleneck)
+        stage(1, net, conf[1], 256, 128, 512, 2, bottleneck)
+        stage(2, net, conf[2], 512, 256, 1024, 2, bottleneck)
+        stage(3, net, conf[3], 1024, 512, 2048, 2, bottleneck)
+    else:
+        stage(0, net, conf[0], 64, 64, 64, 1, basicblock)
+        stage(1, net, conf[1], 64, 128, 128, 2, basicblock)
+        stage(2, net, conf[2], 128, 256, 256, 2, basicblock)
+        stage(3, net, conf[3], 256, 512, 512, 2, basicblock)
+    net.add(AvgPooling2D('avg', 7, 1, pad=0))
     net.add(Flatten('flat'))
     net.add(Dense('dense', 1000))
+    return net
 
-    init_params(net, weight_path)
+
+def create_preact_resnet(depth=200):
+    '''Resnet with the batchnorm and relu moved to before the conv layer for each block'''
+    net = ffnet.FeedForwardNet()
+    net.add(Conv2D('input-conv', 64, 7, 2, pad=3, use_bias=False, input_sample_shape=(3, 224, 224)))
+    net.add(BatchNormalization('input-bn'))
+    net.add(Activation('input_relu'))
+    net.add(MaxPooling2D('input_pool', 3, 2, pad=1))
+    conf = cfg[depth]
+    if depth > 34:
+        stage(0, net, conf[0], 64, 64, 256, 1, bottleneck, preact=True)
+        stage(1, net, conf[1], 256, 128, 512, 2, bottleneck, preact=True)
+        stage(2, net, conf[2], 512, 256, 1024, 2, bottleneck, preact=True)
+        stage(3, net, conf[3], 1024, 512, 2048, 2, bottleneck, preact=True)
+    else:
+        stage(0, net, conf[0], 64, 64, 64, 1, basicblock, preact=True)
+        stage(1, net, conf[1], 64, 128, 128, 2, basicblock, preact=True)
+        stage(2, net, conf[2], 128, 256, 256, 2, basicblock, preact=True)
+        stage(3, net, conf[3], 256, 512, 512, 2, basicblock, preact=True)
+    net.add(BatchNormalization('final-bn'))
+    net.add(Activation('final-relu'))
+    net.add(AvgPooling2D('avg', 7, 1, pad=0))
+    net.add(Flatten('flat'))
+    net.add(Dense('dense', 1000))
     return net
 
-def create_wide_resnet(weight_path=None):
+
+def create_wide_resnet(depth=50):
+    '''Similar to the original resnet except that a<=b<=c for the bottleneck block'''
     net = ffnet.FeedForwardNet()
     net.add(Conv2D('input-conv', 64, 7, 2, pad=3, use_bias=False, input_sample_shape=(3, 224, 224)))
     net.add(BatchNormalization('input-bn'))
@@ -129,10 +254,19 @@
     net.add(AvgPooling2D('avg_pool', 7, 1, pad=0))
     net.add(Flatten('flag'))
     net.add(Dense('dense', 1000))
-
-    init_params(net, weight_path)
     return net
 
+def create_net(name, depth):
+    if name == 'resnet':
+        return create_resnet(depth)
+    elif name == 'wrn':
+        return create_wide_resnet(depth)
+    elif name == 'preact':
+        return create_preact_resnet(depth)
+    elif name == 'addbn':
+        return create_addbn_resnet(depth)
+
+
 if __name__ == '__main__':
-    create_net('wrn-50-2.pickle')
+    create_net('wrn', 50)


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/45ec92d8/examples/imagenet/resnet/serve.py
----------------------------------------------------------------------
diff --git a/examples/imagenet/resnet/serve.py b/examples/imagenet/resnet/serve.py
index d9609ce..a6163f7 100644
--- a/examples/imagenet/resnet/serve.py
+++ b/examples/imagenet/resnet/serve.py
@@ -105,6 +105,8 @@ def main():
     parser.add_argument("-c", "--use_cpu", action="store_true",
                         help="If set, load models onto CPU devices")
     parser.add_argument("--parameter_file", default="wrn-50-2.pickle")
+    parser.add_argument("--model", choices=['resnet', 'wrn', 'preact', 'addbn'], default='wrn')
+    parser.add_argument("--depth", type=int, choices=[18, 34, 50, 101, 152, 200], default=50)
 
     # Process arguments
     args = parser.parse_args()
@@ -113,9 +115,10 @@ def main():
 
     # start to train
     agent = Agent(port)
-    net = model.create_wide_resnet(args.parameter_file)
+    net = model.create_net(args.model, args.depth)
     dev = device.create_cuda_gpu()
     net.to_device(dev)
+    model.init_params(net, args.parameter_file)
     print 'Finish loading models'
     labels = np.loadtxt('synset_words.txt', str, delimiter='\t ')
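
The converted pieces fit together as follows; a minimal sketch, assuming a torch checkpoint named `wrn-50-2.t7` in the working directory (the file name is illustrative; the output pickle name follows convert.py's `os.path.splitext` rule):

    # Step 1: convert the torch checkpoint to a pickled dict of numpy arrays.
    #   python convert.py wrn-50-2.t7 wrn 50      # writes wrn-50-2.pickle
    # Step 2: rebuild the matching SINGA graph and load the converted params.
    import model

    net = model.create_net('wrn', 50)           # wide resnet, as served by serve.py
    model.init_params(net, 'wrn-50-2.pickle')   # calls net.load(..., use_pickle=True)

serve.py performs the same two calls before moving the net onto a GPU device, e.g. `python serve.py --model wrn --depth 50 --parameter_file wrn-50-2.pickle`.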

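As the traverse docstring notes, the same walk can also initialize a SINGA net directly from the torch checkpoint, skipping the pickle step; a sketch following the commented-out `net.param_values()` lines in convert.py's main block (the checkpoint name is illustrative):

    import torchfile

    import convert
    import model

    net = model.create_net('resnet', 18)
    m = torchfile.load('resnet-18.t7')  # illustrative checkpoint name
    # Passing the net's own tensors instead of an empty dict makes add_param
    # take its else-branch, copying each torch array into the SINGA tensor
    # at the same position in param_names.
    convert.traverse(m, 0, net.param_values(), net.param_names())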