eric-haibin-lin closed pull request #9747: Add contrib.rand_zipfian
URL: https://github.com/apache/incubator-mxnet/pull/9747
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

Because this pull request comes from a fork, GitHub does not display its
diff once it has been merged, so the complete diff is reproduced below:

diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py
index 0a3fe48c64..ba402e6f3f 100644
--- a/python/mxnet/ndarray/contrib.py
+++ b/python/mxnet/ndarray/contrib.py
@@ -18,9 +18,81 @@
 # coding: utf-8
 # pylint: disable=wildcard-import, unused-wildcard-import
 """Contrib NDArray API of MXNet."""
+import math
+from ..context import current_context
+from ..random import uniform
 try:
     from .gen_contrib import *
 except ImportError:
     pass
 
-__all__ = []
+__all__ = ["rand_zipfian"]
+
+# pylint: disable=line-too-long
+def rand_zipfian(true_classes, num_sampled, range_max, ctx=None):
+    """Draw random samples from an approximately log-uniform or Zipfian 
distribution.
+
+    This operation randomly samples *num_sampled* candidates from the range of 
integers [0, range_max).
+    The elements of sampled_candidates are drawn with replacement from the 
base distribution.
+
+    The base distribution for this operator is an approximately log-uniform or 
Zipfian distribution:
+
+    P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
+
+    This sampler is useful when the true classes approximately follow such a 
distribution.
+    For example, if the classes represent words in a lexicon sorted in 
decreasing order of \
+    frequency. If your classes are not ordered by decreasing frequency, do not 
use this op.
+
+    Additionally, it returns the number of times each of the \
+    true classes and the sampled classes is expected to occur.
+
+    Parameters
+    ----------
+    true_classes : NDArray
+        A 1-D NDArray of the target classes.
+    num_sampled: int
+        The number of classes to randomly sample.
+    range_max: int
+        The number of possible classes.
+    ctx : Context
+        Device context of output. Default is current context.
+        `true_classes` is brought into this context (via `as_in_context`) before use.
+
+    Returns
+    -------
+    samples: NDArray
+        The sampled candidate classes in 1-D `int64` dtype.
+    expected_count_true: NDArray
+        The expected count for true classes in 1-D `float64` dtype.
+    expected_count_sample: NDArray
+        The expected count for sampled candidates in 1-D `float64` dtype.
+
+    Examples
+    --------
+    >>> true_cls = mx.nd.array([3])
+    >>> samples, exp_count_true, exp_count_sample = 
mx.nd.contrib.rand_zipfian(true_cls, 4, 5)
+    >>> samples
+    [1 3 3 3]
+    <NDArray 4 @cpu(0)>
+    >>> exp_count_true
+    [ 0.12453879]
+    <NDArray 1 @cpu(0)>
+    >>> exp_count_sample
+    [ 0.22629439  0.12453879  0.12453879  0.12453879]
+    <NDArray 4 @cpu(0)>
+    """
+    if ctx is None:
+        ctx = current_context()
+    log_range = math.log(range_max + 1)
+    rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64', 
ctx=ctx)
+    # make sure sampled_classes are in the range of [0, range_max)
+    sampled_classes = (rand.exp() - 1).astype('int64') % range_max
+
+    true_cls = true_classes.as_in_context(ctx).astype('float64')
+    expected_count_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / 
log_range * num_sampled
+    # cast sampled classes to fp64 to avoid integer division
+    sampled_cls_fp64 = sampled_classes.astype('float64')
+    expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 
1.0)).log() / log_range
+    expected_count_sampled = expected_prob_sampled * num_sampled
+    return sampled_classes, expected_count_true, expected_count_sampled
+# pylint: enable=line-too-long
diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py
index 13feb78e37..83e90e6873 100644
--- a/python/mxnet/symbol/contrib.py
+++ b/python/mxnet/symbol/contrib.py
@@ -18,9 +18,76 @@
 # coding: utf-8
 # pylint: disable=wildcard-import, unused-wildcard-import
 """Contrib Symbol API of MXNet."""
+import math
+from .random import uniform
+from .symbol import Symbol
 try:
     from .gen_contrib import *
 except ImportError:
     pass
 
-__all__ = []
+__all__ = ["rand_zipfian"]
+
+def rand_zipfian(true_classes, num_sampled, range_max):
+    """Draw random samples from an approximately log-uniform or Zipfian 
distribution.
+
+    This operation randomly samples *num_sampled* candidates from the range of 
integers [0, range_max).
+    The elements of sampled_candidates are drawn with replacement from the 
base distribution.
+
+    The base distribution for this operator is an approximately log-uniform or 
Zipfian distribution:
+
+    P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
+
+    This sampler is useful when the true classes approximately follow such a 
distribution.
+    For example, if the classes represent words in a lexicon sorted in 
decreasing order of \
+    frequency. If your classes are not ordered by decreasing frequency, do not 
use this op.
+
+    Additionally, it returns the number of times each of the \
+    true classes and the sampled classes is expected to occur.
+
+    Parameters
+    ----------
+    true_classes : Symbol
+        The target classes in 1-D.
+    num_sampled: int
+        The number of classes to randomly sample.
+    range_max: int
+        The number of possible classes.
+
+    Returns
+    -------
+    samples: Symbol
+        The sampled candidate classes in 1-D `int64` dtype.
+    expected_count_true: Symbol
+        The expected count for true classes in 1-D `float64` dtype.
+    expected_count_sample: Symbol
+        The expected count for sampled candidates in 1-D `float64` dtype.
+
+    Examples
+    --------
+    >>> true_cls = mx.nd.array([3])
+    >>> samples, exp_count_true, exp_count_sample = 
mx.nd.contrib.rand_zipfian(true_cls, 4, 5)
+    >>> samples
+    [1 3 3 3]
+    <NDArray 4 @cpu(0)>
+    >>> exp_count_true
+    [ 0.12453879]
+    <NDArray 1 @cpu(0)>
+    >>> exp_count_sample
+    [ 0.22629439  0.12453879  0.12453879  0.12453879]
+    <NDArray 4 @cpu(0)>
+    """
+    assert(isinstance(true_classes, Symbol)), "unexpected type %s" % 
type(true_classes)
+    log_range = math.log(range_max + 1)
+    rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64')
+    # make sure sampled_classes are in the range of [0, range_max)
+    sampled_classes = (rand.exp() - 1).astype('int64') % range_max
+
+    true_classes = true_classes.astype('float64')
+    expected_prob_true = ((true_classes + 2.0) / (true_classes + 1.0)).log() / 
log_range
+    expected_count_true = expected_prob_true * num_sampled
+    # cast sampled classes to fp64 to avoid integer division
+    sampled_cls_fp64 = sampled_classes.astype('float64')
+    expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 
1.0)).log() / log_range
+    expected_count_sampled = expected_prob_sampled * num_sampled
+    return sampled_classes, expected_count_true, expected_count_sampled
diff --git a/tests/python/unittest/test_random.py 
b/tests/python/unittest/test_random.py
index 6b77fc13d4..f042f57c4e 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -521,6 +521,36 @@ def check_data(a, b):
         for j in range(i+1, num_seeds):
             check_data(data[i],data[j])
 
+@with_seed()
+def test_zipfian_generator():
+    # dummy true classes
+    num_true = 5
+    num_sampled = 1000
+    range_max = 20
+
+    def compute_expected_prob():
+        # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
+        classes = mx.nd.arange(0, range_max)
+        expected_counts = ((classes + 2).log() - (classes + 1).log()) / 
np.log(range_max + 1)
+        return expected_counts
+
+    exp_cnt = compute_expected_prob() * num_sampled
+
+    # test ndarray
+    true_classes = mx.nd.random.uniform(0, range_max, 
shape=(num_true,)).astype('int32')
+    sampled_classes, exp_cnt_true, exp_cnt_sampled = 
mx.nd.contrib.rand_zipfian(true_classes, num_sampled, range_max)
+    mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), 
exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
+    mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), 
exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)
+
+    # test symbol
+    true_classes_var = mx.sym.var('true_classes')
+    outputs = mx.sym.contrib.rand_zipfian(true_classes_var, num_sampled, 
range_max)
+    outputs = mx.sym.Group(outputs)
+    executor = outputs.bind(mx.context.current_context(), {'true_classes' : 
true_classes})
+    executor.forward()
+    sampled_classes, exp_cnt_true, exp_cnt_sampled = executor.outputs
+    mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), 
exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
+    mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), 
exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)
 
 if __name__ == '__main__':
     import nose


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to