This is an automated email from the ASF dual-hosted git repository.
nswamy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 3d35260 benchmark.py modified (#7600)
3d35260 is described below
commit 3d35260c3416d7fa102d440bf40c2f91f516dfd2
Author: Roshani Nagmote <[email protected]>
AuthorDate: Fri Sep 8 11:35:48 2017 -0700
benchmark.py modified (#7600)
benchmark script to include gluon
---
example/gluon/image_classification.py | 2 +-
example/image-classification/README.md | 18 ++++-
example/image-classification/benchmark.py | 127 +++++++++++++++++++-----------
3 files changed, 97 insertions(+), 50 deletions(-)
diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py
index b8d018d..c6e99a1 100644
--- a/example/gluon/image_classification.py
+++ b/example/gluon/image_classification.py
@@ -147,7 +147,7 @@ def train(epochs, ctx):
metric.update(label, outputs)
if opt.log_interval and not (i+1)%opt.log_interval:
name, acc = metric.get()
- logging.info('[Epoch %d Batch %d] speed: %f samples/s, training: %s=%f'%(
+ logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
epoch, i, batch_size/(time.time()-btic), name, acc))
btic = time.time()
diff --git a/example/image-classification/README.md b/example/image-classification/README.md
index 1c72a1d..8a64b55 100644
--- a/example/image-classification/README.md
+++ b/example/image-classification/README.md
@@ -263,14 +263,28 @@ The `benchmark.py` can be used to run a series of benchmarks against different i
- `--worker_file`: file that contains a list of worker hostnames or list of worker ip addresses that have passwordless ssh enabled.
- `--worker_count`: number of workers to run benchmark on.
- `--gpu_count`: number of gpus on each worker to use.
-- `--networks`: one or more networks in the format network_name:batch_size:image_size.
+- `--networks`: one or more networks in the format mode:network_name:batch_size:image_size. (Use `native` mode for imagenet benchmarks and any of symbolic/imperative/hybrid for gluon benchmarks.) Be sure to use a model appropriate for the mode you choose.
The `benchmark.py` script first runs the benchmark on a varying number of gpus, starting with 1 gpu and doubling the gpu count on each run up to `gpu_count`, using `kv-store=device`. It then runs on a varying number of nodes (each node using all of its gpus), starting with 1 node and doubling the node count on each run up to `worker_count`, using `kv-store=dist_sync_device` (see the `series()` sketch after this file's diff for the exact schedule).
An example of running the benchmark script with 8 workers and 16 gpus on each worker is shown below:
```
python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \
- --gpu_count 16 --networks 'inception-v3:32:299'
+ --gpu_count 16 --networks 'native:inception-v3:32:299'
+```
+
+Additionally, this script also benchmarks the [Gluon vision models](mxnet/python/mxnet/gluon/model_zoo/model_store.py) by running the [image_classification](mxnet/example/gluon/image_classification.py) script in all three paradigms (symbolic, imperative, and hybrid) using synthetic data.
+An example of running the benchmark script with 8 workers and 16 gpus on each worker is shown below:
+```
+python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \
+ --gpu_count 16 --networks 'imperative:resnet152_v1:32:299'
+```
+
+To run the benchmark on gluon vision models directly, use `--benchmark 1` as the argument to `image_classification.py`. An example is shown below:
+```
+python ../gluon/image_classification.py --dataset dummy --gpus 2 --epochs 1 --benchmark --mode imperative \
+ --model resnet152_v1 --batch-size 32 --log-interval 1 --kv-store dist_sync_device
```
### Scalability Results
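The gpu and node counts above follow the doubling schedule implemented by the `series()` helper in `benchmark.py` (shown in the diff below). A minimal standalone sketch mirroring that helper:
```
def series(max_count):
    # Start at 1 and double until max_count; always include max_count itself
    # so the final run uses every available gpu (or node).
    s, i = [], 1
    while i <= max_count:
        s.append(i)
        i *= 2
    if s[-1] < max_count:
        s.append(max_count)
    return s

print(series(16))  # [1, 2, 4, 8, 16]  -> gpu counts tried with --gpu_count 16
print(series(12))  # [1, 2, 4, 8, 12]  -> a non-power-of-two maximum is still included
```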
diff --git a/example/image-classification/benchmark.py b/example/image-classification/benchmark.py
index 3096fae..2a50d50 100644
--- a/example/image-classification/benchmark.py
+++ b/example/image-classification/benchmark.py
@@ -87,7 +87,8 @@ log_loc = './benchmark'
LOGGER = setup_logging(log_loc)
class Network(object):
- def __init__(self, name, img_size, batch_size):
+ def __init__(self, mode, name, img_size, batch_size):
+ self.mode = mode
self.name = name
self.img_size = img_size
self.batch_size = batch_size
@@ -97,44 +98,54 @@ def parse_args():
class NetworkArgumentAction(argparse.Action):
def validate(self, attrs):
args = attrs.split(':')
- if len(args) != 3 or isinstance(args[0], str) == False:
- print('expected network attributes in format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.')
+ if len(args) != 4 or isinstance(args[0], str) == False or isinstance(args[1], str) == False:
+ print('expected network attributes in format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. \
+ \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.')
sys.exit(1)
try:
- #check if the network exists
- importlib.import_module('symbols.'+ args[0])
- batch_size = int(args[1])
- img_size = int(args[2])
- return Network(name=args[0], batch_size=batch_size, img_size=img_size)
+ # check if the network exists
+ if args[0] == 'native':
+ importlib.import_module('symbols.' + args[1])
+ batch_size = int(args[2])
+ img_size = int(args[3])
+ return Network(mode=args[0], name=args[1], batch_size=batch_size, img_size=img_size)
except Exception as e:
- print('expected network attributes in format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.')
+ print('expected network attributes in format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. \
+ \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.')
print(e)
sys.exit(1)
+
def __init__(self, *args, **kw):
kw['nargs'] = '+'
argparse.Action.__init__(self, *args, **kw)
+
def __call__(self, parser, namespace, values, option_string=None):
if isinstance(values, list) == True:
setattr(namespace, self.dest, map(self.validate, values))
else:
setattr(namespace, self.dest, self.validate(values))
+
parser = argparse.ArgumentParser(description='Run Benchmark on various imagenet networks using train_imagenent.py')
- parser.add_argument('--networks', dest='networks', nargs= '+', type=str, help= 'one or more networks in the format network_name:batch_size:image_size \
- \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder.',action=NetworkArgumentAction)
- parser.add_argument('--worker_file', type=str, help='file that contains a list of worker hostnames or list of worker ip addresses that can be sshed without a password.',required=True)
+ parser.add_argument('--networks', dest='networks', nargs='+', type=str, help='one or more networks in the format mode:network_name:batch_size:image_size \
+ \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder for native imagenet \
+ \n Or a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.',
+ action=NetworkArgumentAction)
+ parser.add_argument('--worker_file', type=str,
+ help='file that contains a list of worker hostnames or list of worker ip addresses that can be sshed without a password.',
+ required=True)
parser.add_argument('--worker_count', type=int, help='number of workers to run benchmark on.', required=True)
parser.add_argument('--gpu_count', type=int, help='number of gpus on each worker to use.', required=True)
args = parser.parse_args()
return args
def series(max_count):
- i=1
- s=[]
+ i = 1
+ s = []
while i <= max_count:
s.append(i)
- i=i*2
+ i = i * 2
if s[-1] < max_count:
s.append(max_count)
return s
@@ -142,9 +153,12 @@ def series(max_count):
'''
Choose the middle iteration to get the images processed per sec
'''
-def images_processed(log_loc):
- f=open(log_loc)
- img_per_sec = re.findall("(?:Batch\s+\[30\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
+def images_processed(log_loc, mode):
+ f = open(log_loc)
+ if mode == 'native':
+ img_per_sec = re.findall("(?:Batch\s+\[30\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
+ else:
+ img_per_sec = re.findall("(?:Batch\s+\[3\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines()))
f.close()
img_per_sec = map(float, img_per_sec)
total_img_per_sec = sum(img_per_sec)
@@ -157,48 +171,64 @@ def generate_hosts_file(num_nodes, workers_file, args_workers_file):
f.close()
return
-def stop_old_processes(hosts_file):
- stop_args = ['python', '../../tools/kill-mxnet.py', hosts_file]
+def stop_old_processes(hosts_file, prog_name):
+ stop_args = ['python', '../../tools/kill-mxnet.py', hosts_file, 'python', prog_name]
stop_args_str = ' '.join(stop_args)
LOGGER.info('killing old remote processes\n %s', stop_args_str)
stop = subprocess.check_output(stop_args, stderr=subprocess.STDOUT)
LOGGER.debug(stop)
time.sleep(1)
-def run_imagenet(kv_store, data_shape, batch_size, num_gpus, num_nodes, network, args_workers_file):
- imagenet_args=['python', 'train_imagenet.py', '--gpus', ','.join(str(i) for i in range(num_gpus)), \
- '--network', network, '--batch-size', str(batch_size * num_gpus), \
- '--image-shape', '3,' + str(data_shape) + ',' + str(data_shape), '--num-epochs', '1' ,'--kv-store', kv_store, '--benchmark', '1', '--disp-batches', '10']
- log = log_loc + '/' + network + '_' + str(num_nodes*num_gpus) + '_log'
- hosts = log_loc + '/' + network + '_' + str(num_nodes*num_gpus) + '_workers'
+def run_benchmark(kv_store, data_shape, batch_size, num_gpus, num_nodes, network, args_workers_file, mode):
+ if mode == 'native':
+ benchmark_args = ['python', 'train_imagenet.py', '--gpus', ','.join(str(i) for i in range(num_gpus)), \
+ '--network', network, '--batch-size', str(batch_size * num_gpus), \
+ '--image-shape', '3,' + str(data_shape) + ',' + str(data_shape), '--num-epochs', '1',
+ '--kv-store', kv_store, '--benchmark', '1', '--disp-batches', '10']
+ else:
+ benchmark_args = ['python', '../gluon/image_classification.py', '--dataset', 'dummy', '--gpus', str(num_gpus), \
+ '--epochs', '1', '--benchmark', '--mode', mode, '--model', network, '--batch-size',
+ str(batch_size), \
+ '--log-interval', str(1), '--kvstore', kv_store]
+
+ log = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_log'
+ hosts = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_workers'
generate_hosts_file(num_nodes, hosts, args_workers_file)
- stop_old_processes(hosts)
- launch_args = ['../../tools/launch.py', '-n', str(num_nodes), '-s', str(num_nodes*2), '-H', hosts, ' '.join(imagenet_args) ]
+ if mode == 'native':
+ stop_old_processes(hosts, 'train_imagenet.py')
+ else:
+ stop_old_processes(hosts, '../gluon/image-classification.py')
+ launch_args = ['../../tools/launch.py', '-n', str(num_nodes), '-s', str(num_nodes * 2), '-H', hosts,
+ ' '.join(benchmark_args)]
- #use train_imagenet when running on a single node
+ # use train_imagenet/image_classification when running on a single node
if kv_store == 'device':
- imagenet = RunCmd(imagenet_args, log)
- imagenet.startCmd(timeout = 60 * 10)
+ imagenet = RunCmd(benchmark_args, log)
+ imagenet.startCmd(timeout=60 * 10)
else:
launch = RunCmd(launch_args, log)
- launch.startCmd(timeout = 60 * 10)
+ launch.startCmd(timeout=60 * 10)
- stop_old_processes(hosts)
- img_per_sec = images_processed(log)
- LOGGER.info('network: %s, num_gpus: %d, image/sec: %f', network, num_gpus*num_nodes, img_per_sec)
+ if mode == 'native':
+ stop_old_processes(hosts, 'train_imagenet.py')
+ else:
+ stop_old_processes(hosts, '../gluon/image-classification.py')
+ img_per_sec = images_processed(log, mode)
+ LOGGER.info('network: %s, num_gpus: %d, image/sec: %f', network, num_gpus * num_nodes, img_per_sec)
return img_per_sec
def plot_graph(args):
- speedup_chart = pygal.Line(x_title ='gpus',y_title ='speedup', logarithmic=True)
+ speedup_chart = pygal.Line(x_title='gpus', y_title='speedup', logarithmic=True)
speedup_chart.x_labels = map(str, series(args.worker_count * args.gpu_count))
speedup_chart.add('ideal speedup', series(args.worker_count * args.gpu_count))
for net in args.networks:
image_single_gpu = net.gpu_speedup[1] if 1 in net.gpu_speedup or not net.gpu_speedup[1] else 1
- y_values = [ each/image_single_gpu for each in net.gpu_speedup.values() ]
- LOGGER.info('%s: image_single_gpu:%.2f' %(net.name, image_single_gpu))
+ y_values = [each / image_single_gpu for each in net.gpu_speedup.values()]
+ LOGGER.info('%s: image_single_gpu:%.2f' % (net.name, image_single_gpu))
LOGGER.debug('network:%s, y_values: %s' % (net.name, ' '.join(map(str, y_values))))
- speedup_chart.add(net.name , y_values \
- , formatter= lambda y_val, img = copy.deepcopy(image_single_gpu), batch_size = copy.deepcopy(net.batch_size): 'speedup:%.2f, img/sec:%.2f, batch/gpu:%d' % \
+ speedup_chart.add(net.name, y_values \
+ , formatter=lambda y_val, img=copy.deepcopy(image_single_gpu), batch_size=copy.deepcopy(
+ net.batch_size): 'speedup:%.2f, img/sec:%.2f, batch/gpu:%d' % \
(0 if y_val is None else y_val, 0 if y_val is None else y_val * img, batch_size))
speedup_chart.render_to_file(log_loc + '/speedup.svg')
@@ -212,14 +242,17 @@ def write_csv(log_loc, args):
def main():
args = parse_args()
for net in args.networks:
- #use kv_store='device' when running on 1 node
+ # use kv_store='device' when running on 1 node
for num_gpus in series(args.gpu_count):
- imgs_per_sec = run_imagenet(kv_store='device', data_shape=net.img_size, batch_size=net.batch_size, \
- num_gpus=num_gpus, num_nodes=1, network=net.name, args_workers_file=args.worker_file)
+ imgs_per_sec = run_benchmark(kv_store='device', data_shape=net.img_size, batch_size=net.batch_size, \
+ num_gpus=num_gpus, num_nodes=1, network=net.name,
+ args_workers_file=args.worker_file, mode=net.mode)
net.gpu_speedup[num_gpus] = imgs_per_sec
for num_nodes in series(args.worker_count)[1::]:
- imgs_per_sec = run_imagenet(kv_store='dist_sync_device', data_shape=net.img_size, batch_size=net.batch_size, \
- num_gpus=args.gpu_count, num_nodes=num_nodes, network=net.name, args_workers_file=args.worker_file)
+ imgs_per_sec = run_benchmark(kv_store='dist_sync_device', data_shape=net.img_size,
+ batch_size=net.batch_size, \
+ num_gpus=args.gpu_count, num_nodes=num_nodes, network=net.name,
+ args_workers_file=args.worker_file, mode=net.mode)
net.gpu_speedup[num_nodes * args.gpu_count] = imgs_per_sec
LOGGER.info('Network: %s (num_gpus, images_processed): %s', net.name, ','.join(map(str, net.gpu_speedup.items())))
write_csv(log_loc, args)
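The throughput numbers are scraped from the training logs by `images_processed()` above: gluon runs are matched at `Batch [3]` in the `Epoch[%d] Batch [%d]\tSpeed: ... samples/sec` format introduced in `image_classification.py` at the top of this diff, while native `train_imagenet.py` runs are matched at `Batch [30]`. A minimal sketch of how that regex behaves, using a made-up gluon log line for illustration:
```
import re

# Hypothetical log line in the gluon image_classification.py format introduced
# above (tab-separated, samples/sec); the numbers are made up for illustration.
lines = ['Epoch[0] Batch [3]\tSpeed: 1234.56 samples/sec\taccuracy=0.100000\n']

# benchmark.py scans str(f.readlines()); str() of the list represents the tab
# as a literal backslash-t, which is why the pattern escapes it as \\t.
pattern = r"(?:Batch\s+\[3\]\\tSpeed:\s+)(\d+\.\d+)(?:\s+)"
img_per_sec = re.findall(pattern, str(lines))
print(img_per_sec)                   # ['1234.56']
print(sum(map(float, img_per_sec)))  # total images/sec across all matching lines
```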
--
To stop receiving notification emails like this one, please contact [email protected].