This is an automated email from the ASF dual-hosted git repository. jxie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push: new a13c46c [MXNET-81] Fix crash with mx.nd.ones (#10014) a13c46c is described below commit a13c46c72d580083cc2fcbd51ce90d5afd8e09d6 Author: Anirudh Subramanian <anirudh2...@gmail.com> AuthorDate: Tue Apr 3 10:33:56 2018 -0700 [MXNET-81] Fix crash with mx.nd.ones (#10014) * Add tests for Exception Handling in Iterators * Fixing test_random * Add documentation for exc handling * Fix for exc handling doc * Fix exc handling doc * Add exception handling documentation * Correct the seed change * Fix * Improve exception handling docs * Add dmlc-core * Empty commit * Add dmlc-core * Move to architecture design docs * Add exception handling to index * Trigger CI * Fix crash case * Rollback earlier changes * Check for valid gpu inside PushAsync * Move to PushAsync * Fix * Fix message * Add test * Fix * Add better error messaging * Trigger CI * Add 3p * make device_count_ atomic --- CONTRIBUTORS.md | 1 + src/engine/threaded_engine.cc | 14 ++++++++++++++ src/engine/threaded_engine.h | 5 +++++ tests/python/gpu/test_operator_gpu.py | 5 +++++ 4 files changed, 25 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4e5dfdb..829d836 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -162,3 +162,4 @@ List of Contributors * [David Braude](https://github.com/dabraude/) * [Nick Robinson](https://github.com/nickrobinson) * [Kan Wu](https://github.com/wkcn) +* [Anirudh Subramanian](https://github.com/anirudh2290/) diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index 2910060..dc0436e 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -309,6 +309,20 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx, int priority, const char* opr_name, bool wait) { +#if MXNET_USE_CUDA + if (exec_ctx.dev_mask() == gpu::kDevMask) { + if (device_count_ < 0) { + int tmp = -1; + cudaGetDeviceCount(&tmp); + device_count_ = tmp; + CHECK_GT(device_count_, 0) << "GPU usage requires at least 1 GPU"; + } + CHECK_LT(exec_ctx.dev_id, device_count_) + << "Invalid GPU Id: " << exec_ctx.dev_id + << ", Valid device id should be less than device_count: " + << device_count_; + } +#endif BulkFlush(); ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name, wait); opr->temporary = true; diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index bfb1b1d..428f0d8 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -535,6 +535,11 @@ class ThreadedEngine : public Engine { std::shared_ptr<common::ObjectPool<VersionedVarBlock> > objpool_varblk_ref_; std::shared_ptr<common::ObjectPool<ThreadedVar> > objpool_var_ref_; +#if MXNET_USE_CUDA + /*! \brief Number of GPU devices available */ + std::atomic<int> device_count_{-1}; +#endif + /*! \brief Hold a ref count ot the profiler */ std::shared_ptr<profiler::Profiler> profiler_; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 1987041..2dd66ee 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -24,7 +24,9 @@ import unittest import mxnet as mx import numpy as np import unittest +from nose.tools import assert_raises from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal +from mxnet.base import MXNetError from numpy.testing import assert_allclose curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) @@ -1780,6 +1782,9 @@ def test_kernel_error_checking(): assert p.exitcode != 0,\ "Expected a synchronous kernel error from %s(), none seen." % f.__name__ +def test_incorrect_gpu(): + # Try setting dev_id to a really big number + assert_raises(MXNetError, mx.nd.ones, (2,2), ctx=mx.gpu(100001)) if __name__ == '__main__': import nose -- To stop receiving notification emails like this one, please contact j...@apache.org.