[incubator-mxnet] branch master updated: Re-enable the test_gpu_memory_profiler_gluon test case (#18704)

zhasheng Tue, 08 Sep 2020 00:51:20 -0700

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git



The following commit(s) were added to refs/heads/master by this push:
     new 5b7a6d9  Re-enable the test_gpu_memory_profiler_gluon test case (#18704)
5b7a6d9 is described below

commit 5b7a6d979b3bcf43416be25a6e47e0fd150daa54
Author: Bojian Zheng <[email protected]>
AuthorDate: Tue Sep 8 03:49:08 2020 -0400

    Re-enable the test_gpu_memory_profiler_gluon test case (#18704)
    
    * Re-enable the test_gpu_memory_profiler_gluon test case
    
    * Change the naming of head gradients
---
 config/linux_gpu.cmake                |   1 +
 python/mxnet/gluon/block.py           |   4 +-
 python/mxnet/gluon/parameter.py       |   2 +-
 python/mxnet/profiler.py              |   6 +-
 python/mxnet/symbol/symbol.py         |  24 +++---
 src/imperative/imperative.cc          |  14 +++-
 src/profiler/storage_profiler.cc      |  11 ++-
 tests/python/gpu/test_profiler_gpu.py | 145 ++++++++++++++++++++++------------
 8 files changed, 137 insertions(+), 70 deletions(-)

diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake
index c75d294..50932d8 100644
--- a/config/linux_gpu.cmake
+++ b/config/linux_gpu.cmake
@@ -129,4 +129,5 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
 set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
+set(USE_NVML OFF CACHE BOOL "Build with NVML support")
 set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index 0a1c758..3178c20 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -947,8 +947,8 @@ class HybridBlock(Block):
                     flatten_inputs.append(None)
             grouped_inputs = _regroup(flatten_inputs, self._in_format)
 
-            params = {i: j.var() for i, j in self._reg_params.items()}
             with _block_scope(self):
+                params = {i: j.var() for i, j in self._reg_params.items()}
                 out = self.hybrid_forward(symbol, *grouped_inputs, **params)  # pylint: disable=no-value-for-parameter
             out, self._out_format = _flatten(out, "output")
 
@@ -1447,8 +1447,8 @@ class HybridBlock(Block):
 
                 return self.hybrid_forward(ndarray, x, *args, **params)
 
-        params = {i: j.var() for i, j in self._reg_params.items()}
         with _block_scope(self):
+            params = {i: j.var() for i, j in self._reg_params.items()}
             return self.hybrid_forward(symbol, x, *args, **params)
 
     def hybrid_forward(self, F, x, *args, **kwargs):
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 2f1f115..68e860b 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -644,7 +644,7 @@ class Parameter(object):
         """Returns a symbol representing this parameter."""
         if self._var is None:
             if self._var_name is None:  # _var_name is set manually in SymbolBlock.import
-                self._var_name = self._uuid
+                self._var_name = self._uuid.replace('-', '_') + '_' + self._name
 
             self._var = symbol.var(self._var_name, shape=self.shape, dtype=self.dtype,
                                    lr_mult=self.lr_mult, wd_mult=self.wd_mult,
diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py
index 1b9583e..78a7dfc 100644
--- a/python/mxnet/profiler.py
+++ b/python/mxnet/profiler.py
@@ -504,7 +504,7 @@ class Marker(object):
 
 
 @contextlib.contextmanager
-def scope(name='<unk>:', append_mode=False):
+def scope(name='<unk>:', append_mode=True):
     """Assign the profiler scope for the GPU memory profiler.
 
     It is implicitly invoked when the Gluon API is used.
@@ -516,7 +516,9 @@ def scope(name='<unk>:', append_mode=False):
 
     """
     name = name + ":" if not name.endswith(":") else name
-    token = _current_scope.set(_current_scope.get() + name if append_mode else name)
+    if append_mode and _current_scope.get() != "<unk>:":
+        name = _current_scope.get() + name
+    token = _current_scope.set(name)
     # Invoke the C API to propagate the profiler scope information to the
     # C++ backend.
     check_call(_LIB.MXSetProfilerScope(c_str(name)))
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 2eebfdf..ddc4a1e 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -43,7 +43,8 @@ from . import _internal
 from . import op
 from ._internal import SymbolBase, _set_symbol_class
 from ..util import is_np_shape
-from ..profiler import _current_scope as _profiler_scope
+from ..profiler import scope as _profiler_scope
+from ..profiler import _current_scope as _current_profiler_scope
 
 __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json",
            "pow", "power", "maximum", "minimum", "hypot", "eye", "zeros",
@@ -1782,15 +1783,16 @@ class Symbol(SymbolBase):
                     index = aux_names.index(name)
                     aux_states[index] = aux_states[index].totype(stype)
 
-        if grad_req == 'null':
-            args_grad = None
-        elif isinstance(grad_req, dict):
-            args_grad = {}
-            for i, name in enumerate(arg_names):
-                if grad_req[name] != 'null':
-                    args_grad[name] = args[i].copy()
-        else:
-            args_grad = [x.copy() for x in args]
+        with _profiler_scope("symbol:arg_grad:"):
+            if grad_req == 'null':
+                args_grad = None
+            elif isinstance(grad_req, dict):
+                args_grad = {}
+                for i, name in enumerate(arg_names):
+                    if grad_req[name] != 'null':
+                        args_grad[name] = args[i].copy()
+            else:
+                args_grad = [x.copy() for x in args]
         return Executor(self, ctx, args, args_grad, grad_req, aux_states)
 
     def _bind(self, ctx, args, args_grad=None, grad_req='write',
@@ -2728,7 +2730,7 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
     if profiler_scope is not None:
         attr['__profiler_scope__'] = profiler_scope
     else:
-        attr['__profiler_scope__'] = _profiler_scope.get()
+        attr['__profiler_scope__'] = _current_profiler_scope.get()
     for k, v in kwargs.items():
         if k.startswith('__') and k.endswith('__'):
             attr[k] = str(v)
diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc
index 8702d93..e2028b0 100644
--- a/src/imperative/imperative.cc
+++ b/src/imperative/imperative.cc
@@ -233,7 +233,7 @@ void Imperative::RecordOp(
 
   nnvm::ObjectPtr node = nnvm::Node::Create();
   node->attrs = std::move(attrs);
-  node->attrs.name = "node_" + std::to_string(node_count_++);
+  node_count_ += 1;
   AGInfo& info = AGInfo::Create(node);
   info.state = state;
   info.ctx = outputs[0]->ctx();
@@ -322,7 +322,7 @@ void Imperative::RecordDeferredCompute(nnvm::NodeAttrs &&attrs,
   }
   node->attrs = std::move(attrs);
   // Need to support NameManager in imperative API to better name node->attrs.name
-  node->attrs.name = "node_" + std::to_string(node_count_++);
+  node_count_ += 1;
 
   for (uint32_t i = 0; i < outputs.size(); ++i) {
     outputs[i]->deferredcompute_entry_ = nnvm::NodeEntry{node, i, 0};
@@ -598,6 +598,16 @@ std::vector<NDArray*> Imperative::Backward(
     }
   }
 
+  for (size_t nid = num_forward_nodes;
+       nid < idx.num_nodes(); ++nid) {
+    const nnvm::NodeAttrs& attrs = idx[nid].source->attrs;
+    for (size_t oid = 0; oid < idx[nid].source->num_outputs(); ++oid) {
+      size_t eid = idx.entry_id(nid, oid);
+      arrays[eid]->AssignStorageInfo(common::NodeAttrsGetProfilerScope(attrs),
+                                     attrs.name);
+    }
+  }  // for (nid ∈ [num_forward_nodes, idx.num_nodes()))
+
   if (dmlc::GetEnv("MXNET_MEM_PLAN_VERBOSE_LOGGING", false)) {
     common::LogMemoryPlan(graph);
   }
diff --git a/src/profiler/storage_profiler.cc b/src/profiler/storage_profiler.cc
index 5bbfa59..b0025a9 100644
--- a/src/profiler/storage_profiler.cc
+++ b/src/profiler/storage_profiler.cc
@@ -23,6 +23,7 @@
 #endif  // MXNET_USE_NVML
 #include <fstream>
 #include <map>
+#include <regex>
 #include <unordered_map>
 #include <vector>
 #include "./profiler.h"
@@ -61,11 +62,17 @@ void GpuDeviceStorageProfiler::DumpProfile() const {
   std::multimap<std::string, AllocEntryDumpFmt> gpu_mem_ordered_alloc_entries;
   // map the GPU device ID to the total amount of allocations
   std::unordered_map<int, size_t> gpu_dev_id_total_alloc_map;
+  std::regex gluon_param_regex("([0-9a-fA-F]{8})_([0-9a-fA-F]{4})_"
+                               "([0-9a-fA-F]{4})_([0-9a-fA-F]{4})_"
+                               "([0-9a-fA-F]{12})_([^ ]*)");
+
   for (const std::pair<void *const, AllocEntry>& alloc_entry :
        gpu_mem_alloc_entries_) {
+    std::string alloc_entry_name
+        = std::regex_replace(alloc_entry.second.name, gluon_param_regex, "$6");
     gpu_mem_ordered_alloc_entries.emplace(
-        alloc_entry.second.profiler_scope +
-        alloc_entry.second.name, AllocEntryDumpFmt{
+        alloc_entry.second.profiler_scope + alloc_entry_name,
+        AllocEntryDumpFmt{
           alloc_entry.second.requested_size,
           alloc_entry.second.dev_id,
           alloc_entry.second.actual_size,
diff --git a/tests/python/gpu/test_profiler_gpu.py b/tests/python/gpu/test_profiler_gpu.py
index 89eb425..05bec4a 100644
--- a/tests/python/gpu/test_profiler_gpu.py
+++ b/tests/python/gpu/test_profiler_gpu.py
@@ -15,25 +15,24 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import csv
 import os
 import sys
 
+import numpy as np
 import mxnet as mx
 mx.test_utils.set_default_context(mx.gpu(0))
 
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-# We import all tests from ../unittest/test_profiler.py
-# They will be detected by test framework, as long as the current file has a different filename
-from test_profiler import *
+from mxnet import profiler
+from mxnet.gluon import nn
+from mxnet.gluon.block import _block_scope
+from test_profiler import enable_profiler
 
-# Test seen to crash pytest worker during development of https://github.com/apache/incubator-mxnet/pull/18694
-del test_aggregate_duplication
 
 def test_gpu_memory_profiler_symbolic():
-    iter_num = 5
-
-    enable_profiler('test_profiler.json', False, False)
+    enable_profiler('test_profiler.json')
     profiler.set_state('run')
 
     with profiler.scope("tensordot"):
@@ -41,18 +40,19 @@ def test_gpu_memory_profiler_symbolic():
         B = mx.sym.Variable('B')
         C = mx.symbol.dot(A, B, name='dot')
 
-    executor = C._simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096))
+    executor = C._simple_bind(mx.gpu(), 'write', A=(1024, 2048), B=(2048, 4096))
 
-    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
-    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
+    with profiler.scope("init"):
+        a = mx.random.uniform(-1.0, 1.0, shape=(1024, 2048))
+        b = mx.random.uniform(-1.0, 1.0, shape=(2048, 4096))
 
     a.copyto(executor.arg_dict['A'])
     b.copyto(executor.arg_dict['B'])
 
-    for i in range(iter_num):
-        executor.forward()
-        c = executor.outputs[0]
-        mx.nd.waitall()
+    executor.forward()
+    executor.backward()
+    c = executor.outputs[0]
+    mx.nd.waitall()
     profiler.set_state('stop')
     profiler.dump(True)
 
@@ -62,41 +62,53 @@ def test_gpu_memory_profiler_symbolic():
             {'Attribute Name' : 'tensordot:in_arg:B',
              'Requested Size' : str(4 * b.size)},
             {'Attribute Name' : 'tensordot:dot',
-             'Requested Size' : str(4 * c.size)}]
+             'Requested Size' : str(4 * c.size)},
+            {'Attribute Name' : 'init:_random_uniform',
+             'Requested Size' : str(4 * a.size)},
+            {'Attribute Name' : 'init:_random_uniform',
+             'Requested Size' : str(4 * b.size)}]
 
     # Sample gpu_memory_profile.csv:
     # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
-    # "<unk>:_zeros","67108864","0","67108864","0"
-    # "<unk>:_zeros","67108864","0","67108864","0"
-    # "tensordot:dot","67108864","0","67108864","1"
-    # "tensordot:dot","67108864","0","67108864","1"
-    # "tensordot:in_arg:A","67108864","0","67108864","0"
-    # "tensordot:in_arg:B","67108864","0","67108864","0"
-    # "nvml_amend","1074790400","0","1074790400","0"
+    # init:_random_uniform,33554432,0,33554432,1
+    # init:_random_uniform,8388608,0,8388608,1
+    # resource:temp_space (sample_op.h +365),8,0,4096,0
+    # symbol:arg_grad:unknown,8388608,0,8388608,0
+    # symbol:arg_grad:unknown,33554432,0,33554432,0
+    # tensordot:dot,16777216,0,16777216,0
+    # tensordot:dot_backward,33554432,0,33554432,0
+    # tensordot:dot_backward,8388608,0,8388608,0
+    # tensordot:dot_head_grad,16777216,0,16777216,0
+    # tensordot:in_arg:A,8388608,0,8388608,0
+    # tensordot:in_arg:B,33554432,0,33554432,0
 
     with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()), mode='r') as csv_file:
         csv_reader = csv.DictReader(csv_file)
+        # TODO: Remove this print statement later on.
+        for row in csv_reader:
+            print(",".join(list(row.values())))
         for expected_alloc_entry in expected_alloc_entries:
             csv_file.seek(0)
             entry_found = False
             for row in csv_reader:
-                if row['Attribute Name'] == expected_alloc_entry['Attribute Name']:
-                    assert row['Requested Size'] == expected_alloc_entry['Requested Size'], \
-                           "requested size={} is not equal to the expected size={}" \
-                           .format(row['Requested Size'],
-                                   expected_alloc_entry['Requested Size'])
+                if row['Attribute Name'] == expected_alloc_entry['Attribute Name'] and \
+                   row['Requested Size'] == expected_alloc_entry['Requested Size']:
                     entry_found = True
                     break
             assert entry_found, \
-                   "Entry for attr_name={} has not been found" \
-                   .format(expected_alloc_entry['Attribute Name'])
+                    "Entry for (attr_name={}, alloc_size={}) has not been found" \
+                    .format(expected_alloc_entry['Attribute Name'],
+                            expected_alloc_entry['Requested Size'])
+        # Make sure that there is no unknown allocation entry.
+        csv_file.seek(0)
+        for row in csv_reader:
+            if row['Attribute Name'] == "<unk>:unknown" or \
+               row['Attribute Name'] == "<unk>:":
+                assert False, "Unknown allocation entry has been encountered"
 
 
-@pytest.mark.skipif(is_cd_run(), reason="flaky test - open issue #18564")
-@pytest.mark.skip(reason='https://github.com/apache/incubator-mxnet/issues/18564')
 def test_gpu_memory_profiler_gluon():
-    enable_profiler(profile_filename='test_profiler.json',
-                    run=True, continuous_dump=True)
+    enable_profiler(profile_filename='test_profiler.json')
     profiler.set_state('run')
 
     model = nn.HybridSequential()
@@ -117,31 +129,64 @@ def test_gpu_memory_profiler_gluon():
     profiler.set_state('stop')
     profiler.dump(True)
 
+    # Sample gpu_memory_profile.csv:
+    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
+    # <unk>:in_arg:data,640,0,4096,0
+    # hybridsequential:activation0:hybridsequential_activation0_fwd,2048,0,4096,0
+    # hybridsequential:activation0:hybridsequential_activation0_fwd_backward,8192,0,8192,0
+    # hybridsequential:activation0:hybridsequential_activation0_fwd_head_grad,2048,0,4096,0
+    # hybridsequential:dense0:activation0:hybridsequential_dense0_activation0_fwd,8192,0,8192,0
+    # hybridsequential:dense0:arg_grad:bias,512,0,4096,0
+    # hybridsequential:dense0:arg_grad:weight,5120,0,8192,0
+    # hybridsequential:dense0:hybridsequential_dense0_fwd,8192,0,8192,0
+    # hybridsequential:dense0:in_arg:bias,512,0,4096,0
+    # hybridsequential:dense0:in_arg:weight,5120,0,8192,0
+    # hybridsequential:dense1:activation0:hybridsequential_dense1_activation0_fwd,4096,0,4096,0
+    # hybridsequential:dense1:arg_grad:bias,256,0,4096,0
+    # hybridsequential:dense1:arg_grad:weight,32768,0,32768,0
+    # hybridsequential:dense1:hybridsequential_dense1_fwd,4096,0,4096,0
+    # hybridsequential:dense1:in_arg:bias,256,0,4096,0
+    # hybridsequential:dense1:in_arg:weight,32768,0,32768,0
+    # hybridsequential:dense2:arg_grad:bias,128,0,4096,0
+    # hybridsequential:dense2:arg_grad:weight,8192,0,8192,0
+    # hybridsequential:dense2:hybridsequential_dense2_fwd_backward,4096,0,4096,1
+    # hybridsequential:dense2:in_arg:bias,128,0,4096,0
+    # hybridsequential:dense2:in_arg:weight,8192,0,8192,0
+    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
+    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
+    # resource:cudnn_dropout_state (dropout-inl.h +256),1474560,0,1474560,0
+    # resource:temp_space (fully_connected-inl.h +316),15360,0,16384,0
+
     # We are only checking for weight parameters here, also making sure that
     # there is no unknown entries in the memory profile.
     with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()), mode='r') as csv_file:
         csv_reader = csv.DictReader(csv_file)
+        # TODO: Remove this print statement later on.
         for row in csv_reader:
             print(",".join(list(row.values())))
-        for scope in ['in_arg', 'arg_grad']:
-            for key, nd in model.collect_params().items():
-                expected_arg_name = "%s:%s:" % (model.name, scope) + nd.name
-                expected_arg_size = str(4 * np.prod(nd.shape))
-                csv_file.seek(0)
-                entry_found = False
-                for row in csv_reader:
-                    if row['Attribute Name'] == expected_arg_name:
-                        assert row['Requested Size'] == expected_arg_size, \
-                            "requested size={} is not equal to the expected size={}" \
-                            .format(row['Requested Size'], expected_arg_size)
-                        entry_found = True
-                        break
-                assert entry_found, \
-                    "Entry for attr_name={} has not been found" \
-                    .format(expected_arg_name)
+        for param in model.collect_params().values():
+            expected_arg_name = "%sin_arg:" % param.var().attr('__profiler_scope__') + \
+                                param.name
+            expected_arg_size = str(4 * np.prod(param.shape))
+            csv_file.seek(0)
+            entry_found = False
+            for row in csv_reader:
+                if row['Attribute Name'] == expected_arg_name and \
+                   row['Requested Size'] == expected_arg_size:
+                    entry_found = True
+                    break
+            assert entry_found, \
+                    "Entry for (attr_name={}, alloc_size={}) has not been found" \
+                        .format(expected_arg_name,
+                                expected_arg_size)
         # Make sure that there is no unknown allocation entry.
         csv_file.seek(0)
         for row in csv_reader:
             if row['Attribute Name'] == "<unk>:unknown" or \
                row['Attribute Name'] == "<unk>:":
                 assert False, "Unknown allocation entry has been encountered"
+
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()

[incubator-mxnet] branch master updated: Re-enable the test_gpu_memory_profiler_gluon test case (#18704)

Reply via email to