This is an automated email from the ASF dual-hosted git repository.
nswamy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 3d35260 benchmark.py modified (#7600)
3d35260 is described below
commit 3d35260c3416d7fa102d440bf40c2f91f516dfd2
Author: Roshani Nagmote <[email protected]>
AuthorDate: Fri Sep 8 11:35:48 2017 -0700
benchmark.py modified (#7600)
benchmark script to include gluon
---
example/gluon/image_classification.py | 2 +-
example/image-classification/README.md | 18 ++++-
example/image-classification/benchmark.py | 127 +++++++++++++++++++-----------
3 files changed, 97 insertions(+), 50 deletions(-)
diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py
index b8d018d..c6e99a1 100644
--- a/example/gluon/image_classification.py
+++ b/example/gluon/image_classification.py
@@ -147,7 +147,7 @@ def train(epochs, ctx):
metric.update(label, outputs)
if opt.log_interval and not (i+1)%opt.log_interval:
name, acc = metric.get()
- logging.info('[Epoch %d Batch %d] speed: %f samples/s, training: %s=%f'%(
+ logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
epoch, i, batch_size/(time.time()-btic), name, acc))
btic = time.time()
diff --git a/example/image-classification/README.md b/example/image-classification/README.md
index 1c72a1d..8a64b55 100644
--- a/example/image-classification/README.md
+++ b/example/image-classification/README.md
@@ -263,14 +263,28 @@ The `benchmark.py` can be used to run a series of benchmarks against different i
- `--worker_file`: file that contains a list of worker hostnames or list of worker ip addresses that have passwordless ssh enabled.
- `--worker_count`: number of workers to run benchmark on.
- `--gpu_count`: number of gpus on each worker to use.
-- `--networks`: one or more networks in the format network_name:batch_size:image_size.
+- `--networks`: one or more networks in the format mode:network_name:batch_size:image_size. (Use `native` mode for imagenet benchmarks and any of symbolic/imperative/hybrid for gluon benchmarks.) Be sure to use a model appropriate for the mode you choose.
The `benchmark.py` script first runs the benchmark on a varying number of gpus, starting with 1 gpu and doubling the gpu count on each run up to `gpu_count`, using `kv-store=device`. It then runs on a varying number of nodes (each node using all of its gpus), starting with 1 node and doubling the node count on each run up to `worker_count`, using `kv-store=dist_sync_device` (see the `series()` sketch after this file's diff for the exact schedule).
An example of running the benchmark script with 8 workers and 16 gpus on each worker is shown below:
```
python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \
- --gpu_count 16 --networks 'inception-v3:32:299'
+ --gpu_count 16 --networks 'native:inception-v3:32:299'
+```
+
+Additionally, this script also benchmarks the [Gluon vision models](mxnet/python/mxnet/gluon/model_zoo/model_store.py) by running the [image_classification](mxnet/example/gluon/image_classification.py) script in all three paradigms (symbolic, imperative, and hybrid) using synthetic data.
+An example of running the benchmark script with 8 workers and 16 gpus on each worker is shown below:
+```
+python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \
+ --gpu_count 16 --networks 'imperative:resnet152_v1:32:299'
+```
+
+To run the benchmark on gluon vision models directly, use `--benchmark 1` as the argument to `image_classification.py`. An example is shown below:
+```
+python ../gluon/image_classification.py --dataset dummy --gpus 2 --epochs 1 --benchmark --mode imperative \
+ --model resnet152_v1 --batch-size 32 --log-interval 1 --kv-store dist_sync_device
```
### Scalability Results
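The gpu and node counts above follow the doubling schedule implemented by the `series()` helper in `benchmark.py` (shown in the diff below). A minimal standalone sketch mirroring that helper:
```
def series(max_count):
    # Start at 1 and double until max_count; always include max_count itself
    # so the final run uses every available gpu (or node).
    s, i = [], 1
    while i <= max_count:
        s.append(i)
        i *= 2
    if s[-1] < max_count:
        s.append(max_count)
    return s

print(series(16))  # [1, 2, 4, 8, 16]  -> gpu counts tried with --gpu_count 16
print(series(12))  # [1, 2, 4, 8, 12]  -> a non-power-of-two maximum is still included
```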
diff --git a/example/image-classification/benchmark.py b/example/image-classification/benchmark.py
index 3096fae..2a50d50 100644
--- a/example/image-classification/benchmark.py
+++ b/example/image-classification/benchmark.py
@@ -87,7 +87,8 @@ log_loc = './benchmark'
LOGGER = setup_logging(log_loc)
class Network(object):
- def __init__(self, name, img_size, batch_size):
+ def __init__(self, mode, name, img_size, batch_size):
+ self.mode = mode
self.name = name
self.img_size = img_size
self.batch_size = batch_size
@@ -97,44 +98,54 @@ def parse_args():
class NetworkArgumentAction(argparse.Action):
def validate(self, attrs):
args = attrs.split(':')
- if len(args) != 3 or isinstance(args[0], str) == False:
- print('expected network attributes in format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.')
+ if len(args) != 4 or isinstance(args[0], str) == False or isinstance(args[1], str) == False:
+ print('expected network attributes in format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. \
+ \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.')
sys.exit(1)
try:
- #check if the network exists
- importlib.import_module('symbols.'+ args[0])
- batch_size = int(args[1])
- img_size = int(args[2])
- return Network(name=args[0], batch_size=batch_size, img_size=img_size)
+ # check if the network exists
+ if args[0] == 'native':
+ importlib.import_module('symbols.' + args[1])
+ batch_size = int(args[2])
+ img_size = int(args[3])
+ return Network(mode=args[0], name=args[1], batch_size=batch_size, img_size=img_size)
except Exception as e:
- print('expected network attributes in format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.')
+ print('expected network attributes in format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. \
+ \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.')
print(e)
sys.exit(1)
+
def __init__(self, *args, **kw):
kw['nargs'] = '+'
argparse.Action.__init__(self, *args, **kw)
+
def __call__(self, parser, namespace, values, option_string=None):
if isinstance(values, list) == True:
setattr(namespace, self.dest, map(self.validate, values))
else:
setattr(namespace, self.dest, self.validate(values))
+
parser = argparse.ArgumentParser(description='Run Benchmark on various imagenet networks using train_imagenent.py')
- parser.add_argument('--networks', dest='networks', nargs= '+', type=str, help= 'one or more networks in the format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.',action=NetworkArgumentAction)
- parser.add_argument('--worker_file', type=str, help='file that contains a list of worker hostnames or list of worker ip addresses that can be sshed without a password.',required=True)
+ parser.add_argument('--networks', dest='networks', nargs='+', type=str, help='one or more networks in the format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder for native imagenet \
+ \n Or a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.',
+ action=NetworkArgumentAction)
+ parser.add_argument('--worker_file', type=str,
+ help='file that contains a list of worker hostnames or list of worker ip addresses that can be sshed without a password.',
+ required=True)
parser.add_argument('--worker_count', type=int, help='number of workers to run benchmark on.', required=True)
parser.add_argument('--gpu_count', type=int, help='number of gpus on each worker to use.', required=True)
args = parser.parse_args()
return args
def series(max_count):
- i=1
- s=[]
+ i = 1
+ s = []
while i <= max_count:
s.append(i)
- i=i*2
+ i = i * 2
if s[-1] < max_count:
s.append(max_count)
return s
@@ -142,9 +153,12 @@ def series(max_count):
'''
Choose the middle iteration to get the images processed per sec
'''
-def images_processed(log_loc):
- f=open(log_loc)
- img_per_sec = re.findall("(?:Batch\s+\[30\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
+def images_processed(log_loc, mode):
+ f = open(log_loc)
+ if mode == 'native':
+ img_per_sec = re.findall("(?:Batch\s+\[30\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
+ else:
+ img_per_sec = re.findall("(?:Batch\s+\[3\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
f.close()
img_per_sec = map(float, img_per_sec)
total_img_per_sec = sum(img_per_sec)
@@ -157,48 +171,64 @@ def generate_hosts_file(num_nodes, workers_file, args_workers_file):
f.close()
return
-def stop_old_processes(hosts_file):
- stop_args = ['python', '../../tools/kill-mxnet.py', hosts_file]
+def stop_old_processes(hosts_file, prog_name):
+ stop_args = ['python', '../../tools/kill-mxnet.py', hosts_file, 'python', prog_name]
stop_args_str = ' '.join(stop_args)
LOGGER.info('killing old remote processes\n %s', stop_args_str)
stop = subprocess.check_output(stop_args, stderr=subprocess.STDOUT)
LOGGER.debug(stop)
time.sleep(1)
-def run_imagenet(kv_store, data_shape, batch_size, num_gpus, num_nodes, network, args_workers_file):
- imagenet_args=['python', 'train_imagenet.py', '--gpus', ','.join(str(i) for i in range(num_gpus)), \
- '--network', network, '--batch-size', str(batch_size * num_gpus), \
- '--image-shape', '3,' + str(data_shape) + ',' + str(data_shape), '--num-epochs', '1' ,'--kv-store', kv_store, '--benchmark', '1', '--disp-batches', '10']
- log = log_loc + '/' + network + '_' + str(num_nodes*num_gpus) + '_log'
- hosts = log_loc + '/' + network + '_' + str(num_nodes*num_gpus) + '_workers'
+def run_benchmark(kv_store, data_shape, batch_size, num_gpus, num_nodes, network, args_workers_file, mode):
+ if mode == 'native':
+ benchmark_args = ['python', 'train_imagenet.py', '--gpus', ','.join(str(i) for i in range(num_gpus)), \
+ '--network', network, '--batch-size', str(batch_size * num_gpus), \
+ '--image-shape', '3,' + str(data_shape) + ',' + str(data_shape), '--num-epochs', '1',
+ '--kv-store', kv_store, '--benchmark', '1', '--disp-batches', '10']
+ else:
+ benchmark_args = ['python', '../gluon/image_classification.py', '--dataset', 'dummy', '--gpus', str(num_gpus), \
+ '--epochs', '1', '--benchmark', '--mode', mode, '--model', network, '--batch-size',
+ str(batch_size), \
+ '--log-interval', str(1), '--kvstore', kv_store]
+
+ log = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_log'
+ hosts = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_workers'
generate_hosts_file(num_nodes, hosts, args_workers_file)
- stop_old_processes(hosts)
- launch_args = ['../../tools/launch.py', '-n', str(num_nodes), '-s', str(num_nodes*2), '-H', hosts, ' '.join(imagenet_args) ]
+ if mode == 'native':
+ stop_old_processes(hosts, 'train_imagenet.py')
+ else:
+ stop_old_processes(hosts, '../gluon/image-classification.py')
+ launch_args = ['../../tools/launch.py', '-n', str(num_nodes), '-s', str(num_nodes * 2), '-H', hosts,
+ ' '.join(benchmark_args)]
- #use train_imagenet when running on a single node
+ # use train_imagenet/image_classification when running on a single node
if kv_store == 'device':
- imagenet = RunCmd(imagenet_args, log)
- imagenet.startCmd(timeout = 60 * 10)
+ imagenet = RunCmd(benchmark_args, log)
+ imagenet.startCmd(timeout=60 * 10)
else:
launch = RunCmd(launch_args, log)
- launch.startCmd(timeout = 60 * 10)
+ launch.startCmd(timeout=60 * 10)
- stop_old_processes(hosts)
- img_per_sec = images_processed(log)
- LOGGER.info('network: %s, num_gpus: %d, image/sec: %f', network, num_gpus*num_nodes, img_per_sec)
+ if mode == 'native':
+ stop_old_processes(hosts, 'train_imagenet.py')
+ else:
+ stop_old_processes(hosts, '../gluon/image-classification.py')
+ img_per_sec = images_processed(log, mode)
+ LOGGER.info('network: %s, num_gpus: %d, image/sec: %f', network, num_gpus * num_nodes, img_per_sec)
return img_per_sec
def plot_graph(args):
- speedup_chart = pygal.Line(x_title ='gpus',y_title ='speedup', logarithmic=True)
+ speedup_chart = pygal.Line(x_title='gpus', y_title='speedup', logarithmic=True)
speedup_chart.x_labels = map(str, series(args.worker_count * args.gpu_count))
speedup_chart.add('ideal speedup', series(args.worker_count * args.gpu_count))
for net in args.networks:
image_single_gpu = net.gpu_speedup[1] if 1 in net.gpu_speedup or not net.gpu_speedup[1] else 1
- y_values = [ each/image_single_gpu for each in net.gpu_speedup.values() ]
- LOGGER.info('%s: image_single_gpu:%.2f' %(net.name, image_single_gpu))
+ y_values = [each / image_single_gpu for each in net.gpu_speedup.values()]
+ LOGGER.info('%s: image_single_gpu:%.2f' % (net.name, image_single_gpu))
LOGGER.debug('network:%s, y_values: %s' % (net.name, ' '.join(map(str, y_values))))
- speedup_chart.add(net.name , y_values \
- , formatter= lambda y_val, img = copy.deepcopy(image_single_gpu), batch_size = copy.deepcopy(net.batch_size): 'speedup:%.2f, img/sec:%.2f, batch/gpu:%d' % \
+ speedup_chart.add(net.name, y_values \
+ , formatter=lambda y_val, img=copy.deepcopy(image_single_gpu), batch_size=copy.deepcopy(
+ net.batch_size): 'speedup:%.2f, img/sec:%.2f, batch/gpu:%d' % \
(0 if y_val is None else y_val, 0 if y_val is None else y_val * img, batch_size))
speedup_chart.render_to_file(log_loc + '/speedup.svg')
@@ -212,14 +242,17 @@ def write_csv(log_loc, args):
def main():
args = parse_args()
for net in args.networks:
- #use kv_store='device' when running on 1 node
+ # use kv_store='device' when running on 1 node
for num_gpus in series(args.gpu_count):
- imgs_per_sec = run_imagenet(kv_store='device', data_shape=net.img_size, batch_size=net.batch_size, \
- num_gpus=num_gpus, num_nodes=1, network=net.name, args_workers_file=args.worker_file)
+ imgs_per_sec = run_benchmark(kv_store='device', data_shape=net.img_size, batch_size=net.batch_size, \
+ num_gpus=num_gpus, num_nodes=1, network=net.name,
+ args_workers_file=args.worker_file, mode=net.mode)
net.gpu_speedup[num_gpus] = imgs_per_sec
for num_nodes in series(args.worker_count)[1::]:
- imgs_per_sec = run_imagenet(kv_store='dist_sync_device', data_shape=net.img_size, batch_size=net.batch_size, \
- num_gpus=args.gpu_count, num_nodes=num_nodes, network=net.name, args_workers_file=args.worker_file)
+ imgs_per_sec = run_benchmark(kv_store='dist_sync_device', data_shape=net.img_size,
+ batch_size=net.batch_size, \
+ num_gpus=args.gpu_count, num_nodes=num_nodes, network=net.name,
+ args_workers_file=args.worker_file, mode=net.mode)
net.gpu_speedup[num_nodes * args.gpu_count] = imgs_per_sec
LOGGER.info('Network: %s (num_gpus, images_processed): %s', net.name, ','.join(map(str, net.gpu_speedup.items())))
write_csv(log_loc, args)
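The throughput numbers are scraped from the training logs by `images_processed()` above: gluon runs are matched at `Batch [3]` in the `Epoch[%d] Batch [%d]\tSpeed: ... samples/sec` format introduced in `image_classification.py` at the top of this diff, while native `train_imagenet.py` runs are matched at `Batch [30]`. A minimal sketch of how that regex behaves, using a made-up gluon log line for illustration:
```
import re

# Hypothetical log line in the gluon image_classification.py format introduced
# above (tab-separated, samples/sec); the numbers are made up for illustration.
lines = ['Epoch[0] Batch [3]\tSpeed: 1234.56 samples/sec\taccuracy=0.100000\n']

# benchmark.py scans str(f.readlines()); str() of the list represents the tab
# as a literal backslash-t, which is why the pattern escapes it as \\t.
pattern = r"(?:Batch\s+\[3\]\\tSpeed:\s+)(\d+\.\d+)(?:\s+)"
img_per_sec = re.findall(pattern, str(lines))
print(img_per_sec)                   # ['1234.56']
print(sum(map(float, img_per_sec)))  # total images/sec across all matching lines
```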
--
To stop receiving notification emails like this one, please contact [email protected].