eric-haibin-lin closed pull request #9747: Add contrib.rand_zipfian URL: https://github.com/apache/incubator-mxnet/pull/9747
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 0a3fe48c64..ba402e6f3f 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -18,9 +18,81 @@ # coding: utf-8 # pylint: disable=wildcard-import, unused-wildcard-import """Contrib NDArray API of MXNet.""" +import math +from ..context import current_context +from ..random import uniform try: from .gen_contrib import * except ImportError: pass -__all__ = [] +__all__ = ["rand_zipfian"] + +# pylint: disable=line-too-long +def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): + """Draw random samples from an approximately log-uniform or Zipfian distribution. + + This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). + The elements of sampled_candidates are drawn with replacement from the base distribution. + + The base distribution for this operator is an approximately log-uniform or Zipfian distribution: + + P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + + This sampler is useful when the true classes approximately follow such a distribution. + For example, if the classes represent words in a lexicon sorted in decreasing order of \ + frequency. If your classes are not ordered by decreasing frequency, do not use this op. + + Additionaly, it also returns the number of times each of the \ + true classes and the sampled classes is expected to occur. + + Parameters + ---------- + true_classes : NDArray + A 1-D NDArray of the target classes. + num_sampled: int + The number of classes to randomly sample. + range_max: int + The number of possible classes. + ctx : Context + Device context of output. Default is current context. Overridden by + `mu.context` when `mu` is an NDArray. + + Returns + ------- + samples: NDArray + The sampled candidate classes in 1-D `int64` dtype. + expected_count_true: NDArray + The expected count for true classes in 1-D `float64` dtype. + expected_count_sample: NDArray + The expected count for sampled candidates in 1-D `float64` dtype. + + Examples + -------- + >>> true_cls = mx.nd.array([3]) + >>> samples, exp_count_true, exp_count_sample = mx.nd.contrib.rand_zipfian(true_cls, 4, 5) + >>> samples + [1 3 3 3] + <NDArray 4 @cpu(0)> + >>> exp_count_true + [ 0.12453879] + <NDArray 1 @cpu(0)> + >>> exp_count_sample + [ 0.22629439 0.12453879 0.12453879 0.12453879] + <NDArray 4 @cpu(0)> + """ + if ctx is None: + ctx = current_context() + log_range = math.log(range_max + 1) + rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64', ctx=ctx) + # make sure sampled_classes are in the range of [0, range_max) + sampled_classes = (rand.exp() - 1).astype('int64') % range_max + + true_cls = true_classes.as_in_context(ctx).astype('float64') + expected_count_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range * num_sampled + # cast sampled classes to fp64 to avoid interget division + sampled_cls_fp64 = sampled_classes.astype('float64') + expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range + expected_count_sampled = expected_prob_sampled * num_sampled + return sampled_classes, expected_count_true, expected_count_sampled +# pylint: enable=line-too-long diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 13feb78e37..83e90e6873 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -18,9 +18,76 @@ # coding: utf-8 # pylint: disable=wildcard-import, unused-wildcard-import """Contrib Symbol API of MXNet.""" +import math +from .random import uniform +from .symbol import Symbol try: from .gen_contrib import * except ImportError: pass -__all__ = [] +__all__ = ["rand_zipfian"] + +def rand_zipfian(true_classes, num_sampled, range_max): + """Draw random samples from an approximately log-uniform or Zipfian distribution. + + This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). + The elements of sampled_candidates are drawn with replacement from the base distribution. + + The base distribution for this operator is an approximately log-uniform or Zipfian distribution: + + P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + + This sampler is useful when the true classes approximately follow such a distribution. + For example, if the classes represent words in a lexicon sorted in decreasing order of \ + frequency. If your classes are not ordered by decreasing frequency, do not use this op. + + Additionaly, it also returns the number of times each of the \ + true classes and the sampled classes is expected to occur. + + Parameters + ---------- + true_classes : Symbol + The target classes in 1-D. + num_sampled: int + The number of classes to randomly sample. + range_max: int + The number of possible classes. + + Returns + ------- + samples: Symbol + The sampled candidate classes in 1-D `int64` dtype. + expected_count_true: Symbol + The expected count for true classes in 1-D `float64` dtype. + expected_count_sample: Symbol + The expected count for sampled candidates in 1-D `float64` dtype. + + Examples + -------- + >>> true_cls = mx.nd.array([3]) + >>> samples, exp_count_true, exp_count_sample = mx.nd.contrib.rand_zipfian(true_cls, 4, 5) + >>> samples + [1 3 3 3] + <NDArray 4 @cpu(0)> + >>> exp_count_true + [ 0.12453879] + <NDArray 1 @cpu(0)> + >>> exp_count_sample + [ 0.22629439 0.12453879 0.12453879 0.12453879] + <NDArray 4 @cpu(0)> + """ + assert(isinstance(true_classes, Symbol)), "unexpected type %s" % type(true_classes) + log_range = math.log(range_max + 1) + rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64') + # make sure sampled_classes are in the range of [0, range_max) + sampled_classes = (rand.exp() - 1).astype('int64') % range_max + + true_classes = true_classes.astype('float64') + expected_prob_true = ((true_classes + 2.0) / (true_classes + 1.0)).log() / log_range + expected_count_true = expected_prob_true * num_sampled + # cast sampled classes to fp64 to avoid interget division + sampled_cls_fp64 = sampled_classes.astype('float64') + expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range + expected_count_sampled = expected_prob_sampled * num_sampled + return sampled_classes, expected_count_true, expected_count_sampled diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index 6b77fc13d4..f042f57c4e 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -521,6 +521,36 @@ def check_data(a, b): for j in range(i+1, num_seeds): check_data(data[i],data[j]) +@with_seed() +def test_zipfian_generator(): + # dummy true classes + num_true = 5 + num_sampled = 1000 + range_max = 20 + + def compute_expected_prob(): + # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + classes = mx.nd.arange(0, range_max) + expected_counts = ((classes + 2).log() - (classes + 1).log()) / np.log(range_max + 1) + return expected_counts + + exp_cnt = compute_expected_prob() * num_sampled + + # test ndarray + true_classes = mx.nd.random.uniform(0, range_max, shape=(num_true,)).astype('int32') + sampled_classes, exp_cnt_true, exp_cnt_sampled = mx.nd.contrib.rand_zipfian(true_classes, num_sampled, range_max) + mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2) + mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2) + + # test symbol + true_classes_var = mx.sym.var('true_classes') + outputs = mx.sym.contrib.rand_zipfian(true_classes_var, num_sampled, range_max) + outputs = mx.sym.Group(outputs) + executor = outputs.bind(mx.context.current_context(), {'true_classes' : true_classes}) + executor.forward() + sampled_classes, exp_cnt_true, exp_cnt_sampled = executor.outputs + mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2) + mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2) if __name__ == '__main__': import nose ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services