Fix-Point commented on code in PR #17214:
URL: https://github.com/apache/nuttx/pull/17214#discussion_r2444978398
##########
include/nuttx/lib/math32.h:
##########
@@ -341,6 +355,238 @@ extern "C"
# define div_const_roundnearest(n, base) (((n) + ((base) / 2)) / (base))
#endif
+/* Division optimizing method proposed by T. Granlund and L. Montgomery.
+ * This method converts the runtime-invariant integer division
+ * into multiplication and right-shifting.
+ *
+ * Usage:
+ * If you want to do n/d division where d is an invariant integer,
+ * initialize the param by `invdiv_init_param(d, param)` first,
+ * then do the division by `invdiv(n, param)`.
+ */
+
+/****************************************************************************
+ * Name: invdiv_init_param32
+ *
+ * Description:
+ * Calculate the triple (multiplier, right shift number 1, right shift
+ * number 2) during initialization.
+ *
+ * Input Parameters:
+ * d - The divisor (unsigned integer). d != 0 and d != 1 is required.
+ *
+ * Output Parameters:
+ * param - The invariant division parameter to be cached.
+ *
+ ****************************************************************************/
+
+static inline_function
+void invdiv_init_param32(uint32_t d, FAR invdiv_param32_t *param)
+{
+ int l = log2ceil(d);
+ uint64_t t1 = (uint64_t)1 << 32;
+ uint64_t t2 = (uint64_t)1 << l;
+
+ param->mult = (t1 * (t2 - d)) / d + 1;
+#if ULONG_MAX == 4294967295UL
+ param->shift = l - 1;
+#else
+ param->shift = l;
+#endif
+}
+
+/****************************************************************************
+ * Name: invdiv_u32
+ *
+ * Description:
+ * Return the result of `n / d`
+ * The division is realized by multiplication and right shift,
+ * where d is already converted to invdiv_param_t.
+ *
+ * Input Parameters:
+ * n - The dividend (uint32_t).
+ * param - The invariant division parameter already cached.
+ *
+ * Returned Value:
+ * The result of `n / d`
+ *
+ ****************************************************************************/
+
+static inline_function
+uint64_t invdiv_u32(uint32_t n, FAR const invdiv_param32_t *param)
+{
+ uint8_t sh = param->shift;
+#if ULONG_MAX == 4294967295UL
+ uint32_t m = param->mult;
+ uint32_t t1 = ((uint64_t)m * n) >> 32; /* UMULH if supported */
+ uint32_t t2 = (n - t1) >> 1;
+#else
+ /* This division can be 25% faster using 64-bit register. */
+
+ uint64_t m = param->mult;
+ uint64_t t1 = (m * n) >> 32;
+ uint32_t t2 = n;
+#endif
+
+ return (t1 + t2) >> sh;
+}
+
+/* Helper function to do n bits integer division. */
+
+#define invdiv_udiv_soft(arr, n, q, d) \
+do \
+{ \
+ uint32_t idx; \
+ uint32_t bits_per_idx = sizeof(uint64_t) * 8; \
+ uint32_t bits_max = bits_per_idx * n; \
+ uint64_t reminder = 0ull; \
+ uint64_t high_rem = 0ull; \
+ for (idx = 0; idx < bits_max; idx++) \
+ { \
+ uint32_t bits = bits_max - idx - 1; \
+ uint32_t bits_idx = bits / bits_per_idx; \
+ uint32_t bits_idx_off = bits % bits_per_idx; \
+ reminder <<= 1u; \
+ reminder |= ((arr)[bits_idx] >> bits_idx_off) & 1u; \
+ if (reminder >= (d)) \
+ { \
+ reminder -= (d); \
+ q[bits_idx] |= (1ull << bits_idx_off); \
+ } \
+ else if (high_rem != 0) \
+ { \
+ uint64_t c = 0ull - (d); \
+ high_rem -= 1; \
+ reminder += c; \
+ q[bits_idx] |= (1ull << bits_idx_off); \
+ } \
+ high_rem += (reminder >> (bits_per_idx - 1)) & 1u; \
+ } \
+} \
+while(0)
+
+/* Helper function to calculate 2 64-bit unsigned integers multiplication
+ * The result is the highest half of the 128-bit unsigned integer.
+ */
+
+static inline_function uint64_t invdiv_umulh64(uint64_t a, uint64_t b)
Review Comment:
1. Yes. Here is the test cases:
https://github.com/Fix-Point/projOPENVELA-InvariantDivisor/blob/main/test_div.c.
2. I prefer `UMULH` inline assembly for each architecture, which is
compiler-irelevant. Sadly, `math32.h` should be architecture-independent, so I
can not add these hard-coded instruction in this file.
3. `libdivide` implements the same algorithm as this commit
https://gmplib.org/~tege/divcnst-pldi94.pdf. The performance of invdiv is close
to or even better than the branch-free implementation of libdivide. Considering
embedded and real-time applications, we want all these functions to have
minimal jitter (WCET - BCET). Therefore, branch-free versions should be
prefered, while non-branchfree implementation may have better average
performance.
##########
include/nuttx/lib/math32.h:
##########
@@ -341,6 +355,238 @@ extern "C"
# define div_const_roundnearest(n, base) (((n) + ((base) / 2)) / (base))
#endif
+/* Division optimizing method proposed by T. Granlund and L. Montgomery.
+ * This method converts the runtime-invariant integer division
+ * into multiplication and right-shifting.
+ *
+ * Usage:
+ * If you want to do n/d division where d is an invariant integer,
+ * initialize the param by `invdiv_init_param(d, param)` first,
+ * then do the division by `invdiv(n, param)`.
+ */
+
+/****************************************************************************
+ * Name: invdiv_init_param32
+ *
+ * Description:
+ * Calculate the triple (multiplier, right shift number 1, right shift
+ * number 2) during initialization.
+ *
+ * Input Parameters:
+ * d - The divisor (unsigned integer). d != 0 and d != 1 is required.
+ *
+ * Output Parameters:
+ * param - The invariant division parameter to be cached.
+ *
+ ****************************************************************************/
+
+static inline_function
+void invdiv_init_param32(uint32_t d, FAR invdiv_param32_t *param)
+{
+ int l = log2ceil(d);
+ uint64_t t1 = (uint64_t)1 << 32;
+ uint64_t t2 = (uint64_t)1 << l;
+
+ param->mult = (t1 * (t2 - d)) / d + 1;
+#if ULONG_MAX == 4294967295UL
+ param->shift = l - 1;
+#else
+ param->shift = l;
+#endif
+}
+
+/****************************************************************************
+ * Name: invdiv_u32
+ *
+ * Description:
+ * Return the result of `n / d`
+ * The division is realized by multiplication and right shift,
+ * where d is already converted to invdiv_param_t.
+ *
+ * Input Parameters:
+ * n - The dividend (uint32_t).
+ * param - The invariant division parameter already cached.
+ *
+ * Returned Value:
+ * The result of `n / d`
+ *
+ ****************************************************************************/
+
+static inline_function
+uint64_t invdiv_u32(uint32_t n, FAR const invdiv_param32_t *param)
+{
+ uint8_t sh = param->shift;
+#if ULONG_MAX == 4294967295UL
+ uint32_t m = param->mult;
+ uint32_t t1 = ((uint64_t)m * n) >> 32; /* UMULH if supported */
+ uint32_t t2 = (n - t1) >> 1;
+#else
+ /* This division can be 25% faster using 64-bit register. */
+
+ uint64_t m = param->mult;
+ uint64_t t1 = (m * n) >> 32;
+ uint32_t t2 = n;
+#endif
+
+ return (t1 + t2) >> sh;
+}
+
+/* Helper function to do n bits integer division. */
+
+#define invdiv_udiv_soft(arr, n, q, d) \
+do \
+{ \
+ uint32_t idx; \
+ uint32_t bits_per_idx = sizeof(uint64_t) * 8; \
+ uint32_t bits_max = bits_per_idx * n; \
+ uint64_t reminder = 0ull; \
+ uint64_t high_rem = 0ull; \
+ for (idx = 0; idx < bits_max; idx++) \
+ { \
+ uint32_t bits = bits_max - idx - 1; \
+ uint32_t bits_idx = bits / bits_per_idx; \
+ uint32_t bits_idx_off = bits % bits_per_idx; \
+ reminder <<= 1u; \
+ reminder |= ((arr)[bits_idx] >> bits_idx_off) & 1u; \
+ if (reminder >= (d)) \
+ { \
+ reminder -= (d); \
+ q[bits_idx] |= (1ull << bits_idx_off); \
+ } \
+ else if (high_rem != 0) \
+ { \
+ uint64_t c = 0ull - (d); \
+ high_rem -= 1; \
+ reminder += c; \
+ q[bits_idx] |= (1ull << bits_idx_off); \
+ } \
+ high_rem += (reminder >> (bits_per_idx - 1)) & 1u; \
+ } \
+} \
+while(0)
+
+/* Helper function to calculate 2 64-bit unsigned integers multiplication
+ * The result is the highest half of the 128-bit unsigned integer.
+ */
+
+static inline_function uint64_t invdiv_umulh64(uint64_t a, uint64_t b)
Review Comment:
1. Yes. Here is the test cases:
https://github.com/Fix-Point/projOPENVELA-InvariantDivisor/blob/main/test_div.c.
2. I prefer `UMULH` inline assembly for each architecture, which is
compiler-independent. Sadly, `math32.h` should be architecture-independent, so
I can not add these hard-coded instruction in this file.
3. `libdivide` implements the same algorithm as this commit
https://gmplib.org/~tege/divcnst-pldi94.pdf. The performance of invdiv is close
to or even better than the branch-free implementation of libdivide. Considering
embedded and real-time applications, we want all these functions to have
minimal jitter (WCET - BCET). Therefore, branch-free versions should be
prefered, while non-branchfree implementation may have better average
performance.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]