This is an automated email from the ASF dual-hosted git repository.

kellen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new b22ee95  [MXNET-857] Add initial NVTX profiler implementation (#12328)
b22ee95 is described below

commit b22ee951ae45f7d34b9ae79433f318db5b6bc5ac
Author: Kellen Sunderland <[email protected]>
AuthorDate: Fri May 10 22:15:12 2019 -0700

    [MXNET-857] Add initial NVTX profiler implementation (#12328)
    
    * [MXNET-857] Enable CUDA NVTX extensions for profiler
    
    These extensions mark readable ranges in the NVIDIA Visual Profiler which helps
    show correlations between kernel launches and graph node executions.
    
    Example shown here:
    https://user-images.githubusercontent.com/7443219/33946110-34296d18-e021-11e7-8d18-6d40b797405c.png
    The additional information enabled is in the 'Markers and Ranges' row.
    
    * [MXNET-857] Add initial NVTX profiler implementation
    
    This commit excludes the NVTX headers from the Amalgamation build
    process; NVTX is a CUDA/CMake-only feature, so it is not relevant to
    Amalgamation builds.
    
    * [MXNET-857] Use macro for NVTX specific code
    
    * [MXNET-857] Add integration test.
    
    * Turn on NVTX by default on Unix.
    
    * Fixed typos and added NVTX info to profiler.md
    
    * Add NVTX example to profiling tutorial
    
    * Add NVTX flags for make
---
 CMakeLists.txt                                   |  10 ++++
 Makefile                                         |   5 ++
 amalgamation/amalgamation.py                     |   2 +-
 cmake/Modules/FindNVTX.cmake                     |  38 +++++++++++++++
 docs/api/python/profiler/profiler.md             |   4 +-
 docs/tutorials/python/profiler.md                |  25 +++++++++-
 docs/tutorials/python/profiler_nvprof.png        | Bin 0 -> 235747 bytes
 docs/tutorials/python/profiler_nvprof_zoomed.png | Bin 0 -> 254663 bytes
 docs/tutorials/python/profiler_winograd.png      | Bin 0 -> 75450 bytes
 make/config.mk                                   |   3 ++
 src/profiler/nvtx.cc                             |  21 ++++++++
 src/profiler/nvtx.h                              |  59 +++++++++++++++++++++++
 src/profiler/profiler.h                          |  20 ++++++++
 tests/python/profiling/simple_forward.py         |  42 ++++++++++++++++
 tests/python/profiling/test_nvtx.py              |  52 ++++++++++++++++++++
 15 files changed, 277 insertions(+), 4 deletions(-)
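
For context, the ranges added by this change boil down to the nvToolsExt range API sketched below. This is a minimal standalone example, not code taken from the commit; it assumes a CUDA toolkit that ships nvToolsExt.h and linking with -lnvToolsExt:

```cpp
// Minimal sketch of the nvToolsExt range API that the new profiler hooks wrap.
// Not part of this commit; assumes CUDA's nvToolsExt header and library.
#include <nvToolsExt.h>

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Open a named range; it appears in the 'Markers and Ranges' row of
  // nvprof / the NVIDIA Visual Profiler timeline.
  nvtxRangeId_t id = nvtxRangeStartA("example_workload");

  // Stand-in for real work (e.g. CUDA kernel launches driven by MXNet).
  std::vector<int> data(1 << 20, 1);
  const long sum = std::accumulate(data.begin(), data.end(), 0L);

  nvtxRangeEnd(id);  // Close the range.
  std::printf("sum = %ld\n", sum);
  return 0;
}
```

The NVTXDuration helper introduced in src/profiler/nvtx.h is essentially a thin wrapper around this start/end pair.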

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f8c33b..896c7b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -290,6 +290,16 @@ if(USE_CUDA)
       message(WARNING "Could not find NCCL libraries")
     endif()
   endif()
+  if(UNIX)
+    find_package(NVTX)
+    if(NVTX_FOUND)
+      include_directories(${NVTX_INCLUDE_DIRS})
+      list(APPEND mxnet_LINKER_LIBS ${NVTX_LIBRARIES})
+      add_definitions(-DMXNET_USE_NVTX=1)
+    else()
+      message(WARNING "Could not find NVTX libraries")
+    endif()
+  endif()
 else()
   add_definitions(-DMSHADOW_USE_CUDA=0)
 endif()
diff --git a/Makefile b/Makefile
index df0fe88..b578683 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,11 @@ ifeq ($(ENABLE_TESTCOVERAGE), 1)
         LDFLAGS += --coverage
 endif
 
+ifeq ($(USE_NVTX), 1)
+        CFLAGS += -DMXNET_USE_NVTX=1
+        LDFLAGS += -lnvToolsExt
+endif
+
 ifeq ($(USE_TENSORRT), 1)
        CFLAGS +=  -I$(ROOTDIR) -I$(TPARTYDIR) -DONNX_NAMESPACE=$(ONNX_NAMESPACE) -DMXNET_USE_TENSORRT=1
        LDFLAGS += -lprotobuf -pthread -lonnx -lonnx_proto -lnvonnxparser -lnvonnxparser_runtime -lnvinfer -lnvinfer_plugin
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index e47ab6b..fef54aa 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -30,7 +30,7 @@ blacklist = [
     'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', 'omp.h',
     'onnx/onnx.pb.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
     'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp',
-    'relacy_shims.h', 'ittnotify.h', 'shared_mutex'
+    'relacy_shims.h', 'ittnotify.h', 'shared_mutex', 'nvToolsExt.h'
     ]
 
 minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0
diff --git a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake
new file mode 100644
index 0000000..bf05eae
--- /dev/null
+++ b/cmake/Modules/FindNVTX.cmake
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set(NVTX_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA NVTX")
+
+find_path(NVTX_INCLUDE_DIRS
+  NAMES nvToolsExt.h
+  PATHS $ENV{NVTOOLSEXT_PATH} ${NVTX_ROOT_DIR}  ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES include
+  )
+
+find_library(NVTX_LIBRARIES
+  NAMES nvToolsExt64_1.lib nvToolsExt32_1.lib nvToolsExt
+  PATHS $ENV{NVTOOLSEXT_PATH} ${NVTX_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 lib/Win32 lib/x64
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NVTX DEFAULT_MSG NVTX_INCLUDE_DIRS NVTX_LIBRARIES)
+
+if(NVTX_FOUND)
+  message(STATUS "Found NVTX (include: ${NVTX_INCLUDE_DIRS}, library: ${NVTX_LIBRARIES})")
+  mark_as_advanced(NVTX_ROOT_DIR NVTX_INCLUDE_DIRS NVTX_LIBRARIES)
+endif()
diff --git a/docs/api/python/profiler/profiler.md b/docs/api/python/profiler/profiler.md
index 565495e..c025811 100644
--- a/docs/api/python/profiler/profiler.md
+++ b/docs/api/python/profiler/profiler.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-MXNet has a built-in profiler which is compatibule with both Intel® VTune™ Amplifier as well as Chrome's chrome://tracing visualization engine.  When built witht he USE_VTUNE=1 flag, MXNet makes actual VTune API calls to define Domains, Frames, Tasks, Events Counters, and Markers.  For a detailed explanation of these, see [Instrumentation and Tracing Technology API Reference ](https://software.intel.com/en-us/vtune-amplifier-help-instrumentation-and-tracing-technology-api-reference)
+MXNet has a built-in profiler which is compatible with Intel® VTune™ Amplifier, NVIDIA NVTX and Chrome's chrome://tracing visualization engine.  When built with the USE_VTUNE=1 flag, MXNet makes VTune API calls to define Domains, Frames, Tasks, Events Counters, and Markers.  For a detailed explanation of these, see [Instrumentation and Tracing Technology API Reference ](https://software.intel.com/en-us/vtune-amplifier-help-instrumentation-and-tracing-technology-api-reference).  When buil [...]
 
 ```eval_rst
 .. autosummary::
@@ -34,7 +34,7 @@ MXNet has a built-in profiler which is compatibule with both Intel® VTune™ Am
 
 ### Profiling Objects
 
-These profiling objects can be created and accessed from python in order to resord performance information of the python code paths
+These profiling objects can be created and accessed from python in order to record performance information of the python code paths.
 
 ```eval_rst
 .. autosummary::
diff --git a/docs/tutorials/python/profiler.md b/docs/tutorials/python/profiler.md
index fe7611a..d3e3355 100644
--- a/docs/tutorials/python/profiler.md
+++ b/docs/tutorials/python/profiler.md
@@ -185,7 +185,7 @@ MXNet executes computation graphs in 'bulk mode' which reduces kernel launch gap
 
 ### Viewing profiler output
 
-There are two ways to view the information collected by the profiler. You can either view it in the console or you can view a more graphical version in a browser.
+There are a few ways to view the information collected by the profiler. You can view it in the console, you can view a more graphical version in a browser, or you can use a vendor tool such as Intel VTune or Nvidia NVProf to view output. For most scenarios the information you need can be obtained with MXNet's built in profiler support, but if you want to investigate the performance of operators along side extra context about your hardware (e.g. cache hit rates, or CUDA kernel timings) th [...]
 
 #### 1. View in console
 
@@ -215,6 +215,29 @@ Let's zoom in to check the time taken by operators
 
 The above picture visualizes the sequence in which the operators were executed 
and the time taken by each operator.
 
+#### 3. View in NVProf
+
+You can view all MXNet profiler information alongside CUDA kernel information by using the MXNet profiler along with NVProf.  Use the MXNet profiler as in the samples above, but invoke your python script with the following wrapper process available on most systems that support CUDA:
+
+```bash
+nvprof -o my_profile.nvvp python my_profiler_script.py
+==11588== NVPROF is profiling process 11588, command: python my_profiler_script.py
+==11588== Generated result file: /home/kellen/Development/incubator-mxnet/ci/my_profile.nvvp
+```
+Your my_profile.nvvp file will automatically be annotated with NVTX ranges displayed alongside your standard NVProf timeline.  This can be very useful when you're trying to find patterns between operators run by MXNet, and their associated CUDA kernel calls.
+
+![Operator profiling](profiler_nvprof.png)
+
+In this picture we see a rough overlay of a few types of information plotted on a horizontal timeline.  At the top of the plot we have CPU tasks such as driver operations, memory copy calls, MXNet engine operator invocations, and imperative MXNet API calls.  Below we see the kernels active on the GPU during the same time period.
+
+![Operator profiling](profiler_nvprof_zoomed.png)
+
+Zooming in on a backwards convolution operator we can see that it is in fact made up of a number of different GPU kernel calls, including a cuDNN winograd convolution call, and a fast-fourier transform call.
+
+![Operator profiling](profiler_winograd.png)
+
+Selecting any of these kernel calls (the winograd convolution call shown here) will get you some interesting GPU performance information such as occupancy rates (vs theoretical), shared memory usage and execution duration.
+
 ### Further reading
 
 - [Examples using MXNet profiler.](https://github.com/apache/incubator-mxnet/tree/master/example/profiler)
diff --git a/docs/tutorials/python/profiler_nvprof.png b/docs/tutorials/python/profiler_nvprof.png
new file mode 100644
index 0000000..37d8615
Binary files /dev/null and b/docs/tutorials/python/profiler_nvprof.png differ
diff --git a/docs/tutorials/python/profiler_nvprof_zoomed.png b/docs/tutorials/python/profiler_nvprof_zoomed.png
new file mode 100644
index 0000000..9b6b6e8
Binary files /dev/null and b/docs/tutorials/python/profiler_nvprof_zoomed.png differ
diff --git a/docs/tutorials/python/profiler_winograd.png b/docs/tutorials/python/profiler_winograd.png
new file mode 100644
index 0000000..5b4fcc3
Binary files /dev/null and b/docs/tutorials/python/profiler_winograd.png differ
diff --git a/make/config.mk b/make/config.mk
index 2083467..2080a01 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -80,6 +80,9 @@ ENABLE_CUDA_RTC = 1
 # whether use CuDNN R3 library
 USE_CUDNN = 0
 
+# whether to use NVTX when profiling
+USE_NVTX = 0
+
 #whether to use NCCL library
 USE_NCCL = 0
 #add the path to NCCL library
diff --git a/src/profiler/nvtx.cc b/src/profiler/nvtx.cc
new file mode 100644
index 0000000..9151873
--- /dev/null
+++ b/src/profiler/nvtx.cc
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+#include "nvtx.h"
diff --git a/src/profiler/nvtx.h b/src/profiler/nvtx.h
new file mode 100644
index 0000000..c36bb50
--- /dev/null
+++ b/src/profiler/nvtx.h
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+#ifndef MXNET_PROFILER_NVTX_H_
+#define MXNET_PROFILER_NVTX_H_
+
+#if MXNET_USE_NVTX
+
+#include <string>
+#include <unordered_map>
+#include "nvToolsExt.h"
+
+namespace mxnet {
+namespace profiler {
+namespace nvtx {
+
+class NVTXDuration {
+ public:
+  explicit NVTXDuration(const char *name) noexcept
+      : range_id_(0), name_(name) {}
+
+  inline void start() {
+    range_id_ = nvtxRangeStartA(name_);
+  }
+
+  inline void stop() {
+    nvtxRangeEnd(range_id_);
+  }
+
+ private:
+  nvtxRangeId_t range_id_;
+  const char *name_;
+};
+
+
+
+}  // namespace nvtx
+}  // namespace profiler
+}  // namespace mxnet
+
+#endif  // MXNET_USE_NVTX
+#endif  // MXNET_PROFILER_NVTX_H_
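
As a rough usage sketch (a hypothetical caller, not code from this commit), wrapping a region with the class above looks like the following; it assumes the build defines MXNET_USE_NVTX=1 and that the include path contains the repository root:

```cpp
// Hypothetical caller of NVTXDuration; assumes -DMXNET_USE_NVTX=1 and the
// MXNet repository root on the include path. Not part of this commit.
#include "src/profiler/nvtx.h"

void run_step() {
  mxnet::profiler::nvtx::NVTXDuration duration("forward_pass");
  duration.start();  // opens an NVTX range named "forward_pass"
  // ... launch kernels / execute operators here ...
  duration.stop();   // closes the range so it shows up in nvprof/NVVP
}
```

Inside MXNet these calls are not made directly; they are routed through the NVTX_ONLY_CODE macro added to src/profiler/profiler.h below.
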
diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h
index f1fac9a..f9eb0af 100644
--- a/src/profiler/profiler.h
+++ b/src/profiler/profiler.h
@@ -35,6 +35,7 @@
 #include <array>
 #include "./vtune.h"
 #include "./aggregate_stats.h"
+#include "./nvtx.h"
 
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
 #include <windows.h>
@@ -489,6 +490,12 @@ class Profiler {
 #define VTUNE_ONLY_CODE(...) /* */        /* This is undefined at the bottom of this file */
 #endif
 
+#ifdef MXNET_USE_NVTX
+#define NVTX_ONLY_CODE(...) __VA_ARGS__  /* This is undefined at the bottom of this file */
+#else
+#define NVTX_ONLY_CODE(...) /* */        /* This is undefined at the bottom of this file */
+#endif
+
 /**
  *  _____              __  _  _  _                ____  _     _            _
  * |  __ \            / _|(_)| |(_)              / __ \| |   (_)          | |
@@ -777,6 +784,7 @@ struct ProfileTask : public ProfileDuration {
     categories_.set(domain_->name());
     categories_.append(",task");
     VTUNE_ONLY_CODE(vtune_task_.reset(new vtune::VTuneTask(name, domain->dom())));
+    NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
   }
 
   /*!
@@ -785,6 +793,7 @@ struct ProfileTask : public ProfileDuration {
   void start() override {
     start_time_ = ProfileStat::NowInMicrosec();
     VTUNE_ONLY_CODE(vtune_task_->start());
+    NVTX_ONLY_CODE(nvtx_duration_->start());
   }
 
   /*!
@@ -792,6 +801,7 @@ struct ProfileTask : public ProfileDuration {
    */
   void stop() override {
     VTUNE_ONLY_CODE(vtune_task_->stop());
+    NVTX_ONLY_CODE(nvtx_duration_->stop());
     SendStat();
   }
 
@@ -831,6 +841,8 @@ struct ProfileTask : public ProfileDuration {
   ProfileDomain *domain_;
   /*! \brief VTune task object */
   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneTask> vtune_task_);
+  /*! \brief NVTX duration object */
+  NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
 
  protected:
   /*! \brief Task's start tick */
@@ -849,6 +861,7 @@ struct ProfileEvent  : public ProfileDuration {
     : name_(name)
       , categories_("event") {
     VTUNE_ONLY_CODE(vtune_event_ = vtune::VTuneEvent::registry_.get(name));
+    NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
   }
 
   /*!
@@ -857,6 +870,7 @@ struct ProfileEvent  : public ProfileDuration {
   void start() override {
     start_time_ = ProfileStat::NowInMicrosec();
     VTUNE_ONLY_CODE(vtune_event_->start());
+    NVTX_ONLY_CODE(nvtx_duration_->start());
   }
 
   /*!
@@ -905,6 +919,8 @@ struct ProfileEvent  : public ProfileDuration {
   profile_stat_string categories_;
   /*! \brief VTune event object */
   VTUNE_ONLY_CODE(vtune::VTuneEvent *vtune_event_);
+  /*! \brief NVTX duration object */
+  NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_;);
 
  protected:
   /*! \brief Start time of the event */
@@ -926,6 +942,7 @@ struct ProfileFrame : public ProfileDuration {
     CHECK_NOTNULL(domain);
     categories_.set(domain_->name());
     categories_.append(",frame");
+    NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
     VTUNE_ONLY_CODE(vtune_frame_.reset(new vtune::VTuneFrame(domain->dom())));
   }
 
@@ -935,6 +952,7 @@ struct ProfileFrame : public ProfileDuration {
   void start() override {
     start_time_ = ProfileStat::NowInMicrosec();
     VTUNE_ONLY_CODE(vtune_frame_->start());
+    NVTX_ONLY_CODE(nvtx_duration_->start());
   }
 
   /*!
@@ -977,6 +995,8 @@ struct ProfileFrame : public ProfileDuration {
   ProfileDomain *domain_;
   /*! \brief VTune Frame object */
   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneFrame> vtune_frame_);
+  /*! \brief NVTX duration object */
+  NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
 
  protected:
   /*! \brief Frame start time */
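
The NVTX_ONLY_CODE macro above mirrors the existing VTUNE_ONLY_CODE pattern: NVTX members and calls compile away entirely when MXNET_USE_NVTX is not set. Below is a condensed, hypothetical illustration of that pattern; MyTimedRegion is invented for the example and is not MXNet code:

```cpp
// Condensed illustration of the NVTX_ONLY_CODE pattern used in profiler.h.
// MyTimedRegion is a made-up class; assumes -DMXNET_USE_NVTX=1 and the
// MXNet repository root on the include path.
#include <memory>
#include "src/profiler/nvtx.h"

#if MXNET_USE_NVTX
#define NVTX_ONLY_CODE(...) __VA_ARGS__   /* NVTX on: emit the code as written */
#else
#define NVTX_ONLY_CODE(...) /* */         /* NVTX off: compile to nothing */
#endif

class MyTimedRegion {
 public:
  explicit MyTimedRegion(const char *name) {
    NVTX_ONLY_CODE(nvtx_duration_.reset(new mxnet::profiler::nvtx::NVTXDuration(name)));
  }
  void start() { NVTX_ONLY_CODE(nvtx_duration_->start()); }
  void stop()  { NVTX_ONLY_CODE(nvtx_duration_->stop()); }

 private:
  // The member itself also disappears from the class when NVTX is disabled.
  NVTX_ONLY_CODE(std::unique_ptr<mxnet::profiler::nvtx::NVTXDuration> nvtx_duration_);
};
```

ProfileTask, ProfileEvent and ProfileFrame in the diff follow exactly this shape, alongside their existing VTune counterparts.
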
diff --git a/tests/python/profiling/simple_forward.py b/tests/python/profiling/simple_forward.py
new file mode 100644
index 0000000..0ad43c8
--- /dev/null
+++ b/tests/python/profiling/simple_forward.py
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.gluon import nn
+
+
+def simple_forward():
+    ctx = mx.gpu()
+    mx.profiler.set_config(profile_all=True)
+    mx.profiler.set_state('run')
+
+    # define simple gluon network with random weights
+    net = nn.Sequential()
+    with net.name_scope():
+        net.add(nn.Dense(128, activation='relu'))
+        net.add(nn.Dense(64, activation='relu'))
+        net.add(nn.Dense(10))
+    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
+
+    input = mx.nd.zeros((128,), ctx=ctx)
+    predictions = net(input)
+    print('Ran simple NN forward, results:')
+    print(predictions.asnumpy())
+
+
+if __name__ == '__main__':
+    simple_forward()
diff --git a/tests/python/profiling/test_nvtx.py b/tests/python/profiling/test_nvtx.py
new file mode 100644
index 0000000..35b209e
--- /dev/null
+++ b/tests/python/profiling/test_nvtx.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import unittest
+
+import mxnet as mx
+import sys
+
+from subprocess import Popen, PIPE
+
+
+def test_nvtx_ranges_present_in_profile():
+
+    if not mx.test_utils.list_gpus():
+        unittest.skip('Test only applicable to machines with GPUs')
+
+    # Build a system independent wrapper to execute simple_forward with nvprof
+    # This requires nvprof to be on your path (which should be the case for most GPU workstations with cuda installed).
+    simple_forward_path = os.path.realpath(__file__)
+    simple_forward_path = simple_forward_path.replace('test_nvtx', 'simple_forward')
+
+    process = Popen(["nvprof", sys.executable, simple_forward_path], stdout=PIPE, stderr=PIPE)
+    (output, profiler_output) = process.communicate()
+    process.wait()
+    profiler_output = profiler_output.decode('ascii')
+
+    # Verify that some of the NVTX ranges we should have created are present
+    # Verify that we have NVTX ranges for our simple operators.
+    assert "Range \"FullyConnected\"" in profiler_output
+    assert "Range \"_zeros\"" in profiler_output
+
+    # Verify that we have some expected output from the engine.
+    assert "Range \"WaitForVar\"" in profiler_output
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
