This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new b29c3e45a11 branch-4.1: [optimize](be) optimize floating fmod fast path #64161 (#64193)
b29c3e45a11 is described below
commit b29c3e45a1197baf55ad7fcd4822bb232a618303
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Jun 8 14:13:55 2026 +0800
branch-4.1: [optimize](be) optimize floating fmod fast path #64161 (#64193)
Cherry-picked from #64161
Co-authored-by: zclllyybb <[email protected]>
---
be/benchmark/benchmark_fmod.hpp | 442 +++++++++++++++++++++
be/benchmark/benchmark_main.cpp | 1 +
be/src/exprs/function/fmod_fast.cpp | 160 ++++++++
be/src/exprs/function/fmod_fast.h | 41 ++
be/src/exprs/function/modulo.cpp | 68 +++-
be/test/exprs/function/function_fmod_fast_test.cpp | 353 ++++++++++++++++
6 files changed, 1048 insertions(+), 17 deletions(-)
diff --git a/be/benchmark/benchmark_fmod.hpp b/be/benchmark/benchmark_fmod.hpp
new file mode 100644
index 00000000000..ca9718fa94b
--- /dev/null
+++ b/be/benchmark/benchmark_fmod.hpp
@@ -0,0 +1,442 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <random>
+#include <vector>
+
+#include "exprs/function/fmod_fast.h"
+
+namespace doris {
+namespace {
+
+// Operand layouts produced by fill_actual_load_sample(). The numeric value is what the
+// vector/vector benchmarks pass as their first Args entry.
+enum FmodBenchCase {
+ DB_DB = 0, // lhs and rhs are the same scaled row value
+ IN_ONE_DB = 1, // lhs = row_num * 2e-7, rhs = scaled row value
+ DB_IN_ONE = 2, // lhs = scaled row value, rhs = row_num * 2e-7
+ DB_IN_TEN = 3, // lhs = scaled row value, rhs = row_num * 2e-6
+ MIXED_SIGNS_AND_ZEROS = 4, // random signed lhs; rhs is 0.0 every 97th row, else random signed
+};
+
+// Fills `lhs` and `rhs` with `size` deterministic operand pairs for the given FmodBenchCase.
+// Row i is projected onto a virtual 50M-row data set made of ten 5M-row loads, each load having
+// its own scale factor. Any case id that is not listed falls back to the DB_IN_ONE layout.
+void fill_actual_load_sample(int case_id, size_t size, std::vector<double>* lhs,
+                             std::vector<double>* rhs) {
+    constexpr double kScales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                  6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+    constexpr uint64_t kRowsPerLoad = 5'000'000;
+    constexpr uint64_t kTotalRows = 50'000'000;
+
+    std::vector<double>& dividends = *lhs;
+    std::vector<double>& divisors = *rhs;
+    dividends.resize(size);
+    divisors.resize(size);
+
+    // The generator is re-seeded with the same constant on every call.
+    std::mt19937_64 rng(0xadc83b19ULL);
+    std::uniform_real_distribution<double> large_dist(-4.5e10, 4.5e10);
+    std::uniform_real_distribution<double> small_dist(-10.0, 10.0);
+
+    for (size_t i = 0; i < size; ++i) {
+        const uint64_t virtual_row = (static_cast<uint64_t>(i) * kTotalRows) / size;
+        const uint64_t load = std::min<uint64_t>(virtual_row / kRowsPerLoad, 9);
+        const auto row_num = static_cast<double>((virtual_row % kRowsPerLoad) + 1);
+        const double scaled = row_num * kScales[load];
+        const double within_one = row_num * 2e-7;
+        const double within_ten = row_num * 2e-6;
+
+        double& dividend = dividends[i];
+        double& divisor = divisors[i];
+        if (case_id == DB_DB) {
+            dividend = scaled;
+            divisor = scaled;
+        } else if (case_id == IN_ONE_DB) {
+            dividend = within_one;
+            divisor = scaled;
+        } else if (case_id == DB_IN_TEN) {
+            dividend = scaled;
+            divisor = within_ten;
+        } else if (case_id == MIXED_SIGNS_AND_ZEROS) {
+            // Draw the dividend first so the random sequence is consumed in a fixed order.
+            dividend = large_dist(rng);
+            divisor = (i % 97 == 0) ? 0.0 : small_dist(rng);
+        } else {
+            // DB_IN_ONE and every unrecognised case id.
+            dividend = scaled;
+            divisor = within_one;
+        }
+    }
+}
+
+// Float flavour of fill_actual_load_sample(): generates the double samples and narrows each
+// value to float.
+void fill_actual_load_sample_float(int case_id, size_t size, std::vector<float>* lhs,
+                                   std::vector<float>* rhs) {
+    std::vector<double> wide_lhs;
+    std::vector<double> wide_rhs;
+    fill_actual_load_sample(case_id, size, &wide_lhs, &wide_rhs);
+
+    const auto narrow = [](double v) { return static_cast<float>(v); };
+    lhs->resize(size);
+    rhs->resize(size);
+    std::transform(wide_lhs.begin(), wide_lhs.end(), lhs->begin(), narrow);
+    std::transform(wide_rhs.begin(), wide_rhs.end(), rhs->begin(), narrow);
+}
+
+// std::fmod baseline for two double columns. A zero divisor sets null_map[row] to 1 and is
+// bumped to 1.0 before the call so the result slot still receives a defined value.
+void std_vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map,
+                       size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const double divisor = rhs[row];
+        const uint8_t zero_divisor = divisor == 0.0;
+        null_map[row] = zero_divisor;
+        result[row] = std::fmod(lhs[row], divisor + static_cast<double>(zero_divisor));
+    }
+}
+
+// std::fmod baseline for two float columns. The remainder is computed in double and narrowed
+// back to float; a zero divisor sets null_map[row] to 1 and is bumped to 1.0F first.
+void std_vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map,
+                       size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0F;
+        null_map[row] = zero_divisor;
+        const float safe_divisor = rhs[row] + static_cast<float>(zero_divisor);
+        const double wide_dividend = static_cast<double>(lhs[row]);
+        const double wide_divisor = static_cast<double>(safe_divisor);
+        result[row] = static_cast<float>(std::fmod(wide_dividend, wide_divisor));
+    }
+}
+
+// std::fmod baseline for a double column divided by one constant. The whole null map is set to
+// (rhs == 0); when the divisor is zero, `result` is left untouched.
+void std_vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map,
+                         size_t size) {
+    const uint8_t null_flag = rhs == 0.0;
+    memset(null_map, null_flag, size);
+    if (!null_flag) {
+        for (size_t row = 0; row != size; ++row) {
+            result[row] = std::fmod(lhs[row], rhs);
+        }
+    }
+}
+
+// std::fmod baseline for a float column divided by one constant, computed in double and
+// narrowed back. The null map is set to (rhs == 0); a zero divisor leaves `result` untouched.
+void std_vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map,
+                         size_t size) {
+    const uint8_t null_flag = rhs == 0.0F;
+    memset(null_map, null_flag, size);
+    if (!null_flag) {
+        const double wide_divisor = static_cast<double>(rhs);
+        for (size_t row = 0; row != size; ++row) {
+            result[row] = static_cast<float>(std::fmod(static_cast<double>(lhs[row]), wide_divisor));
+        }
+    }
+}
+
+// std::fmod baseline for one constant double dividend against a divisor column. A zero divisor
+// sets null_map[row] to 1 and is bumped to 1.0 before the call.
+void std_constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map,
+                         size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const double divisor = rhs[row];
+        const uint8_t zero_divisor = divisor == 0.0;
+        null_map[row] = zero_divisor;
+        result[row] = std::fmod(lhs, divisor + static_cast<double>(zero_divisor));
+    }
+}
+
+// std::fmod baseline for one constant float dividend against a divisor column, computed in
+// double and narrowed back. A zero divisor sets null_map[row] to 1 and is bumped to 1.0F.
+void std_constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map,
+                         size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0F;
+        null_map[row] = zero_divisor;
+        const float safe_divisor = rhs[row] + static_cast<float>(zero_divisor);
+        const double wide_dividend = static_cast<double>(lhs);
+        const double wide_divisor = static_cast<double>(safe_divisor);
+        result[row] = static_cast<float>(std::fmod(wide_dividend, wide_divisor));
+    }
+}
+
+// Registers one (case id, row count) argument pair per FmodBenchCase, then applies the shared
+// reporting options: millisecond unit, real time, 5 repetitions, aggregates only.
+void benchmark_args(benchmark::internal::Benchmark* b) {
+    constexpr int64_t kRows = 1 << 20;
+    for (const int64_t case_id :
+         {DB_DB, IN_ONE_DB, DB_IN_ONE, DB_IN_TEN, MIXED_SIGNS_AND_ZEROS}) {
+        b->Args({case_id, kRows});
+    }
+    b->Unit(benchmark::kMillisecond);
+    b->UseRealTime();
+    b->Repetitions(5);
+    b->DisplayAggregatesOnly();
+}
+
+// Registers a single row-count argument and applies the shared reporting options:
+// millisecond unit, real time, 5 repetitions, aggregates only.
+void benchmark_const_args(benchmark::internal::Benchmark* b) {
+    constexpr int64_t kRows = 1 << 20;
+    b->Arg(kRows);
+    b->Unit(benchmark::kMillisecond);
+    b->UseRealTime();
+    b->Repetitions(5);
+    b->DisplayAggregatesOnly();
+}
+
+// Baseline: std::fmod over two double columns; range(0) is the case id, range(1) the row count.
+static void BM_FmodDoubleVectorVectorStd(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> divisors;
+    fill_actual_load_sample(static_cast<int>(state.range(0)), state.range(1), &dividends,
+                            &divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_vector(dividends.data(), divisors.data(), remainders.data(), nulls.data(),
+                          rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_vector over two double columns; range(0) is the case id, range(1) the row
+// count.
+static void BM_FmodDoubleVectorVectorFast(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> divisors;
+    fill_actual_load_sample(static_cast<int>(state.range(0)), state.range(1), &dividends,
+                            &divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_vector(dividends.data(), divisors.data(), remainders.data(),
+                                 nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline: std::fmod of a double column by the constant divisor 0.9999998.
+static void BM_FmodDoubleVectorConstantStd(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_constant(dividends.data(), 0.9999998, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_constant of a double column by the constant divisor 0.9999998.
+static void BM_FmodDoubleVectorConstantFast(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(dividends.data(), 0.9999998, remainders.data(), nulls.data(),
+                                   rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline for a constant zero divisor on a double column; the result buffer is pre-filled
+// with -777.0.
+static void BM_FmodDoubleConstZeroStd(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows, -777.0);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_constant(dividends.data(), 0.0, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_constant with a constant zero divisor on a double column; the result
+// buffer is pre-filled with -777.0.
+static void BM_FmodDoubleConstZeroFast(benchmark::State& state) {
+    std::vector<double> dividends;
+    std::vector<double> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<double> remainders(rows, -777.0);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(dividends.data(), 0.0, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline: std::fmod of the constant dividend 12345.678 by a double divisor column.
+static void BM_FmodDoubleConstantVectorStd(benchmark::State& state) {
+    std::vector<double> unused_dividends; // filled by the generator, not read by this benchmark
+    std::vector<double> divisors;
+    fill_actual_load_sample(IN_ONE_DB, state.range(0), &unused_dividends, &divisors);
+    const size_t rows = divisors.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_constant_vector(12345.678, divisors.data(), remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::constant_vector of the constant dividend 12345.678 by a double divisor column.
+static void BM_FmodDoubleConstantVectorFast(benchmark::State& state) {
+    std::vector<double> unused_dividends; // filled by the generator, not read by this benchmark
+    std::vector<double> divisors;
+    fill_actual_load_sample(IN_ONE_DB, state.range(0), &unused_dividends, &divisors);
+    const size_t rows = divisors.size();
+    std::vector<double> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::constant_vector(12345.678, divisors.data(), remainders.data(), nulls.data(),
+                                   rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline: std::fmod over two float columns; range(0) is the case id, range(1) the row count.
+static void BM_FmodFloatVectorVectorStd(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> divisors;
+    fill_actual_load_sample_float(static_cast<int>(state.range(0)), state.range(1), &dividends,
+                                  &divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_vector(dividends.data(), divisors.data(), remainders.data(), nulls.data(),
+                          rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_vector over two float columns; range(0) is the case id, range(1) the row
+// count.
+static void BM_FmodFloatVectorVectorFast(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> divisors;
+    fill_actual_load_sample_float(static_cast<int>(state.range(0)), state.range(1), &dividends,
+                                  &divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_vector(dividends.data(), divisors.data(), remainders.data(),
+                                 nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline: std::fmod of a float column by the constant divisor 0.9999998F.
+static void BM_FmodFloatVectorConstantStd(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_constant(dividends.data(), 0.9999998F, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_constant of a float column by the constant divisor 0.9999998F.
+static void BM_FmodFloatVectorConstantFast(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(dividends.data(), 0.9999998F, remainders.data(), nulls.data(),
+                                   rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline for a constant zero divisor on a float column; the result buffer is pre-filled
+// with -777.0F.
+static void BM_FmodFloatConstZeroStd(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows, -777.0F);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_vector_constant(dividends.data(), 0.0F, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::vector_constant with a constant zero divisor on a float column; the result
+// buffer is pre-filled with -777.0F.
+static void BM_FmodFloatConstZeroFast(benchmark::State& state) {
+    std::vector<float> dividends;
+    std::vector<float> unused_divisors; // filled by the generator, not read by this benchmark
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &dividends, &unused_divisors);
+    const size_t rows = dividends.size();
+    std::vector<float> remainders(rows, -777.0F);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(dividends.data(), 0.0F, remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// Baseline: std::fmod of the constant dividend 12345.678F by a float divisor column.
+static void BM_FmodFloatConstantVectorStd(benchmark::State& state) {
+    std::vector<float> unused_dividends; // filled by the generator, not read by this benchmark
+    std::vector<float> divisors;
+    fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &unused_dividends, &divisors);
+    const size_t rows = divisors.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        std_constant_vector(12345.678F, divisors.data(), remainders.data(), nulls.data(), rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+// fmod_fast::constant_vector of the constant dividend 12345.678F by a float divisor column.
+static void BM_FmodFloatConstantVectorFast(benchmark::State& state) {
+    std::vector<float> unused_dividends; // filled by the generator, not read by this benchmark
+    std::vector<float> divisors;
+    fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &unused_dividends, &divisors);
+    const size_t rows = divisors.size();
+    std::vector<float> remainders(rows);
+    std::vector<uint8_t> nulls(rows);
+
+    for (auto _ : state) {
+        fmod_fast::constant_vector(12345.678F, divisors.data(), remainders.data(), nulls.data(),
+                                   rows);
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rows));
+}
+
+} // namespace
+
+// Benchmark registration. The vector/vector benchmarks sweep every FmodBenchCase through
+// benchmark_args; the constant-operand benchmarks take only a row count through
+// benchmark_const_args. Each Std entry is followed by its Fast counterpart.
+BENCHMARK(BM_FmodDoubleVectorVectorStd)->Apply(benchmark_args);
+BENCHMARK(BM_FmodDoubleVectorVectorFast)->Apply(benchmark_args);
+BENCHMARK(BM_FmodDoubleVectorConstantStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleVectorConstantFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstZeroStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstZeroFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstantVectorStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstantVectorFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatVectorVectorStd)->Apply(benchmark_args);
+BENCHMARK(BM_FmodFloatVectorVectorFast)->Apply(benchmark_args);
+BENCHMARK(BM_FmodFloatVectorConstantStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatVectorConstantFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstZeroStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstZeroFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstantVectorStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstantVectorFast)->Apply(benchmark_const_args);
+
+} // namespace doris
diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp
index e36f61febbc..905331fa422 100644
--- a/be/benchmark/benchmark_main.cpp
+++ b/be/benchmark/benchmark_main.cpp
@@ -19,6 +19,7 @@
#include "benchmark_bit_pack.hpp"
#include "benchmark_fastunion.hpp"
+#include "benchmark_fmod.hpp"
#include "benchmark_hll_merge.hpp"
#include "benchmark_zone_map_index.hpp"
#include "binary_cast_benchmark.hpp"
diff --git a/be/src/exprs/function/fmod_fast.cpp b/be/src/exprs/function/fmod_fast.cpp
new file mode 100644
index 00000000000..0f3673a90b2
--- /dev/null
+++ b/be/src/exprs/function/fmod_fast.cpp
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exprs/function/fmod_fast.h"
+
+#include <string.h>
+
+#include <cmath>
+
+#include "common/compiler_util.h"
+
+namespace doris::fmod_fast {
+namespace {
+
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
+#define DORIS_HAS_X87_FMOD_FAST 1
+
+// Computes the remainder of a / b with the x87 FPREM instruction.
+//
+// Both operands are loaded from memory onto the x87 register stack (b first, then a, so a is
+// st(0) and b is st(1)). FPREM only performs a partial reduction per execution and reports an
+// unfinished reduction through the C2 status flag (bit 10 of the status word, i.e. bit 2 of AH
+// after FNSTSW), so the instruction is repeated until that bit is clear. Afterwards the divisor
+// is discarded and the remainder is stored to `r`, leaving the x87 stack as it was on entry.
+//
+// The asm pushes two values, so both "st" and "st(1)" are listed as clobbers; listing only
+// "st" would tell the compiler that a single free x87 stack slot is enough.
+ALWAYS_INLINE inline double fmod_x87_fprem(double a, double b) {
+    double r;
+    asm volatile(
+            "fldl %[b]\n\t"
+            "fldl %[a]\n\t"
+            "1:\n\t"
+            "fprem\n\t"
+            "fnstsw %%ax\n\t"
+            "testb $4, %%ah\n\t"
+            "jne 1b\n\t"
+            "fstp %%st(1)\n\t"
+            "fstpl %[r]\n\t"
+            : [r] "=m"(r)
+            : [a] "m"(a), [b] "m"(b)
+            : "ax", "cc", "st", "st(1)");
+    return r;
+}
+#else
+#define DORIS_HAS_X87_FMOD_FAST 0
+#endif
+
+// Remainder of a / b for doubles. When the x87 fast path is compiled in and both operands are
+// finite with a non-zero divisor, the trivial cases (|a| < |b| and |a| == |b|) are answered
+// directly and the rest goes to FPREM; every other input is forwarded to std::fmod.
+ALWAYS_INLINE inline double fmod_double(double a, double b) {
+#if DORIS_HAS_X87_FMOD_FAST
+    const bool fast_path_ok = b != 0.0 && std::isfinite(a) && std::isfinite(b);
+    if (fast_path_ok) {
+        const double magnitude_a = std::fabs(a);
+        const double magnitude_b = std::fabs(b);
+        if (magnitude_a > magnitude_b) {
+            return fmod_x87_fprem(a, b);
+        }
+        // Equal magnitudes give a zero carrying the dividend's sign; a smaller dividend is
+        // already the remainder.
+        return magnitude_a == magnitude_b ? std::copysign(0.0, a) : a;
+    }
+#endif
+    return std::fmod(a, b);
+}
+
+// Float remainder: widens both operands to double, reuses fmod_double, and narrows the result.
+ALWAYS_INLINE inline float fmod_float(float a, float b) {
+    const double wide_a = static_cast<double>(a);
+    const double wide_b = static_cast<double>(b);
+    return static_cast<float>(fmod_double(wide_a, wide_b));
+}
+
+// Overload set that lets the templated loops pick the remainder routine by element type.
+ALWAYS_INLINE inline double fmod_value(double a, double b) {
+    const double remainder = fmod_double(a, b);
+    return remainder;
+}
+
+ALWAYS_INLINE inline float fmod_value(float a, float b) {
+    const float remainder = fmod_float(a, b);
+    return remainder;
+}
+
+// Element-wise remainder of two columns. A zero divisor sets null_map[row] to 1 and is bumped
+// to 1 before the remainder is taken, so the result slot still receives a defined value.
+template <typename T>
+ALWAYS_INLINE inline void vector_vector_impl(const T* lhs, const T* rhs, T* result,
+                                             uint8_t* null_map, size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const T divisor = rhs[row];
+        const uint8_t zero_divisor = divisor == T(0);
+        null_map[row] = zero_divisor;
+        result[row] = fmod_value(lhs[row], divisor + static_cast<T>(zero_divisor));
+    }
+}
+
+// Remainder of a column by one constant divisor. The whole null map is set to (rhs == 0);
+// when the divisor is zero, `result` is left untouched.
+template <typename T>
+ALWAYS_INLINE inline void vector_constant_impl(const T* lhs, T rhs, T* result, uint8_t* null_map,
+                                               size_t size) {
+    const uint8_t null_flag = rhs == T(0);
+    memset(null_map, null_flag, size);
+    if (!null_flag) {
+        for (size_t row = 0; row != size; ++row) {
+            result[row] = fmod_value(lhs[row], rhs);
+        }
+    }
+}
+
+// Remainder of one constant dividend by a divisor column. A zero divisor sets null_map[row]
+// to 1 and is bumped to 1 before the remainder is taken.
+template <typename T>
+ALWAYS_INLINE inline void constant_vector_impl(T lhs, const T* rhs, T* result, uint8_t* null_map,
+                                               size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const T divisor = rhs[row];
+        const uint8_t zero_divisor = divisor == T(0);
+        null_map[row] = zero_divisor;
+        result[row] = fmod_value(lhs, divisor + static_cast<T>(zero_divisor));
+    }
+}
+
+} // namespace
+
+// Reports whether this translation unit was compiled with the x87 FPREM fast path.
+bool is_x87_fast_path_enabled() {
+    return DORIS_HAS_X87_FMOD_FAST != 0;
+}
+
+// Public single-value entry points; they forward to the internal remainder helpers.
+double scalar(double a, double b) {
+    const double remainder = fmod_double(a, b);
+    return remainder;
+}
+
+float scalar(float a, float b) {
+    const float remainder = fmod_float(a, b);
+    return remainder;
+}
+
+// Public column-by-column entry points; both instantiate the shared template loop.
+void vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map,
+                   size_t size) {
+    vector_vector_impl<double>(lhs, rhs, result, null_map, size);
+}
+
+void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map,
+                   size_t size) {
+    vector_vector_impl<float>(lhs, rhs, result, null_map, size);
+}
+
+// Public column-by-constant entry points; both instantiate the shared template loop.
+void vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map,
+                     size_t size) {
+    vector_constant_impl<double>(lhs, rhs, result, null_map, size);
+}
+
+void vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, size_t size) {
+    vector_constant_impl<float>(lhs, rhs, result, null_map, size);
+}
+
+// Public constant-by-column entry points; both instantiate the shared template loop.
+void constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map,
+                     size_t size) {
+    constant_vector_impl<double>(lhs, rhs, result, null_map, size);
+}
+
+void constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, size_t size) {
+    constant_vector_impl<float>(lhs, rhs, result, null_map, size);
+}
+
+} // namespace doris::fmod_fast
diff --git a/be/src/exprs/function/fmod_fast.h b/be/src/exprs/function/fmod_fast.h
new file mode 100644
index 00000000000..6bcc7481165
--- /dev/null
+++ b/be/src/exprs/function/fmod_fast.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace doris::fmod_fast {
+
+// Returns true when the implementation was built with the x87 FPREM fast path
+// (x86-64 with GCC or Clang); otherwise every call goes through std::fmod.
+bool is_x87_fast_path_enabled();
+
+// Floating-point remainder of a / b. Zero divisors and non-finite operands are forwarded to
+// std::fmod. The float overload computes in double and narrows the result.
+double scalar(double a, double b);
+float scalar(float a, float b);
+
+// For each i in [0, size): null_map[i] = (rhs[i] == 0) and result[i] = remainder of
+// lhs[i] / rhs[i]. A zero divisor is replaced by 1 before the remainder is taken.
+void vector_vector(const double* lhs, const double* rhs, double* result,
uint8_t* null_map,
+ size_t size);
+void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t*
null_map,
+ size_t size);
+
+// Remainder of every lhs[i] by the single divisor rhs. All `size` null_map entries are set to
+// (rhs == 0); when rhs is zero, result is not written.
+void vector_constant(const double* lhs, double rhs, double* result, uint8_t*
null_map, size_t size);
+void vector_constant(const float* lhs, float rhs, float* result, uint8_t*
null_map, size_t size);
+
+// Remainder of the single dividend lhs by every rhs[i]. null_map[i] = (rhs[i] == 0), and a
+// zero divisor is replaced by 1 before the remainder is taken.
+void constant_vector(double lhs, const double* rhs, double* result, uint8_t*
null_map, size_t size);
+void constant_vector(float lhs, const float* rhs, float* result, uint8_t*
null_map, size_t size);
+
+} // namespace doris::fmod_fast
diff --git a/be/src/exprs/function/modulo.cpp b/be/src/exprs/function/modulo.cpp
index 6ddad76d880..8bd3b86cbbd 100644
--- a/be/src/exprs/function/modulo.cpp
+++ b/be/src/exprs/function/modulo.cpp
@@ -33,6 +33,7 @@
#include "core/types.h"
#include "core/value/decimalv2_value.h"
#include "exprs/function/cast_type_to_either.h"
+#include "exprs/function/fmod_fast.h"
#include "exprs/function/simple_function_factory.h"
namespace doris {
@@ -395,9 +396,13 @@ struct ModNumericImpl {
auto& b = column_right_ptr->get_data();
auto& c = column_result->get_data();
auto& n = null_map->get_data();
- size_t size = b.size();
- for (size_t i = 0; i < size; ++i) {
- c[i] = Impl::apply(a, b[i], n[i]);
+ if constexpr (requires { Impl::apply(a, b, c, n); }) {
+ Impl::apply(a, b, c, n);
+ } else {
+ size_t size = b.size();
+ for (size_t i = 0; i < size; ++i) {
+ c[i] = Impl::apply(a, b[i], n[i]);
+ }
}
return ColumnNullable::create(std::move(column_result),
std::move(null_map));
}
@@ -414,9 +419,13 @@ struct ModNumericImpl {
auto& b = column_right_ptr->get_data();
auto& c = column_result->get_data();
auto& n = null_map->get_data();
- size_t size = a.size();
- for (size_t i = 0; i < size; ++i) {
- c[i] = Impl::apply(a[i], b[i], n[i]);
+ if constexpr (requires { Impl::apply(a, b, c, n); }) {
+ Impl::apply(a, b, c, n);
+ } else {
+ size_t size = a.size();
+ for (size_t i = 0; i < size; ++i) {
+ c[i] = Impl::apply(a[i], b[i], n[i]);
+ }
}
return ColumnNullable::create(std::move(column_result),
std::move(null_map));
}
@@ -439,17 +448,42 @@ struct ModuloNumericImpl {
static void apply(const typename ColumnType::Container& a, ArgB b,
typename ColumnType::Container& c,
PaddedPODArray<UInt8>& null_map) {
size_t size = c.size();
- UInt8 is_null = b == 0;
- memset(null_map.data(), is_null, sizeof(UInt8) * size);
-
- if (!is_null) {
+ if constexpr (is_float_or_double(Type)) {
+ fmod_fast::vector_constant(a.data(), b, c.data(), null_map.data(),
size);
+ } else {
+ UInt8 is_null = b == 0;
+ memset(null_map.data(), is_null, sizeof(UInt8) * size);
+ if (is_null) {
+ return;
+ }
for (size_t i = 0; i < size; i++) {
- if constexpr (is_float_or_double(Type)) {
- c[i] = std::fmod((double)a[i], (double)b);
- } else {
- throw_if_division_leads_to_FPE(a[i], b);
- c[i] = a[i] % b;
- }
+ throw_if_division_leads_to_FPE(a[i], b);
+ c[i] = a[i] % b;
+ }
+ }
+ }
+
+ static void apply(ArgA a, const typename ColumnType::Container& b,
+ typename ColumnType::Container& c,
PaddedPODArray<UInt8>& null_map) {
+ size_t size = c.size();
+ if constexpr (is_float_or_double(Type)) {
+ fmod_fast::constant_vector(a, b.data(), c.data(), null_map.data(),
size);
+ } else {
+ for (size_t i = 0; i < size; ++i) {
+ c[i] = apply(a, b[i], null_map[i]);
+ }
+ }
+ }
+
+ static void apply(const typename ColumnType::Container& a,
+ const typename ColumnType::Container& b, typename
ColumnType::Container& c,
+ PaddedPODArray<UInt8>& null_map) {
+ size_t size = c.size();
+ if constexpr (is_float_or_double(Type)) {
+ fmod_fast::vector_vector(a.data(), b.data(), c.data(),
null_map.data(), size);
+ } else {
+ for (size_t i = 0; i < size; ++i) {
+ c[i] = apply(a[i], b[i], null_map[i]);
}
}
}
@@ -460,7 +494,7 @@ struct ModuloNumericImpl {
b += is_null;
if constexpr (is_float_or_double(Type)) {
- return std::fmod((double)a, (double)b);
+ return fmod_fast::scalar(a, b);
} else {
throw_if_division_leads_to_FPE(a, b);
return a % b;
diff --git a/be/test/exprs/function/function_fmod_fast_test.cpp b/be/test/exprs/function/function_fmod_fast_test.cpp
new file mode 100644
index 00000000000..ac4f7c0dd57
--- /dev/null
+++ b/be/test/exprs/function/function_fmod_fast_test.cpp
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <bit>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "core/data_type/data_type_number.h"
+#include "core/types.h"
+#include "exprs/function/fmod_fast.h"
+#include "exprs/function/function_test_util.h"
+#include "testutil/any_type.h"
+
+namespace doris {
+namespace {
+
+// Reinterprets a double as its 64-bit object representation so two values can
+// be compared bit-for-bit instead of with floating-point equality.
+uint64_t bits(double value) {
+    const auto raw = std::bit_cast<uint64_t>(value);
+    return raw;
+}
+
+// Float counterpart of bits(double): yields the 32-bit object representation.
+uint32_t bits(float value) {
+    const auto raw = std::bit_cast<uint32_t>(value);
+    return raw;
+}
+
+// Asserts that `actual` agrees with `expected` for the operand pair (lhs, rhs).
+// When the expectation is NaN any NaN is accepted; otherwise the two values
+// must have identical bit patterns.
+void expect_same_double(double actual, double expected, double lhs, double rhs) {
+    if (!std::isnan(expected)) {
+        ASSERT_EQ(bits(expected), bits(actual))
+                << "lhs=" << lhs << " rhs=" << rhs << " expected=" << expected
+                << " actual=" << actual;
+        return;
+    }
+    ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs;
+}
+
+// Asserts that `actual` agrees with `expected` for the operand pair (lhs, rhs).
+// When the expectation is NaN any NaN is accepted; otherwise the two values
+// must have identical bit patterns.
+void expect_same_float(float actual, float expected, float lhs, float rhs) {
+    if (!std::isnan(expected)) {
+        ASSERT_EQ(bits(expected), bits(actual))
+                << "lhs=" << lhs << " rhs=" << rhs << " expected=" << expected
+                << " actual=" << actual;
+        return;
+    }
+    ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs;
+}
+
+// Reference result for double operands: plain std::fmod.
+double reference_fmod(double lhs, double rhs) {
+    const double remainder = std::fmod(lhs, rhs);
+    return remainder;
+}
+
+// Reference result for float operands: both operands are widened to double,
+// std::fmod is evaluated on the widened values and the result is narrowed back
+// to float.
+float reference_fmod(float lhs, float rhs) {
+    const auto wide_lhs = static_cast<double>(lhs);
+    const auto wide_rhs = static_cast<double>(rhs);
+    return static_cast<float>(std::fmod(wide_lhs, wide_rhs));
+}
+
+// Corner-case operands used by the scalar and batch checks; each supported
+// floating-point type provides its own specialization.
+template <typename T>
+std::vector<T> interesting_values();
+
+// double corner cases: signed zeros, small and large magnitudes, the smallest
+// normal and subnormal values, the largest finite value, both infinities and
+// a quiet NaN.
+template <>
+std::vector<double> interesting_values<double>() {
+    using Limits = std::numeric_limits<double>;
+    const double min_normal = Limits::min();
+    const double min_subnormal = Limits::denorm_min();
+    const double max_finite = Limits::max();
+    const double inf = Limits::infinity();
+    const double nan = Limits::quiet_NaN();
+    return {0.0,           -0.0,           1.0,
+            -1.0,          2.0,            -2.0,
+            2.5,           -2.5,           1000.1575,
+            -1000.1575,    44'728'676'500.0, -44'728'676'500.0,
+            min_normal,    -min_normal,    min_subnormal,
+            -min_subnormal, max_finite,    -max_finite,
+            inf,           -inf,           nan};
+}
+
+// float corner cases: signed zeros, small and large magnitudes, the smallest
+// normal and subnormal values, the largest finite value, both infinities and
+// a quiet NaN.
+template <>
+std::vector<float> interesting_values<float>() {
+    using Limits = std::numeric_limits<float>;
+    const float min_normal = Limits::min();
+    const float min_subnormal = Limits::denorm_min();
+    const float max_finite = Limits::max();
+    const float inf = Limits::infinity();
+    const float nan = Limits::quiet_NaN();
+    return {0.0F,          -0.0F,          1.0F,
+            -1.0F,         2.0F,           -2.0F,
+            2.5F,          -2.5F,          1000.1575F,
+            -1000.1575F,   1.0e10F,        -1.0e10F,
+            min_normal,    -min_normal,    min_subnormal,
+            -min_subnormal, max_finite,    -max_finite,
+            inf,           -inf,           nan};
+}
+
+// Compares fmod_fast::scalar with the reference result for one operand pair.
+// Specialized per type so the matching expectation helper is used.
+template <typename T>
+void check_scalar_pair(T lhs, T rhs);
+
+template <>
+void check_scalar_pair<double>(double lhs, double rhs) {
+    const double actual = fmod_fast::scalar(lhs, rhs);
+    const double expected = reference_fmod(lhs, rhs);
+    expect_same_double(actual, expected, lhs, rhs);
+}
+
+template <>
+void check_scalar_pair<float>(float lhs, float rhs) {
+    const float actual = fmod_fast::scalar(lhs, rhs);
+    const float expected = reference_fmod(lhs, rhs);
+    expect_same_float(actual, expected, lhs, rhs);
+}
+
+// Runs check_scalar_pair over the full cross product of interesting_values<T>(),
+// so every corner case appears both as dividend and as divisor.
+template <typename T>
+void check_scalar_corner_cases() {
+    const std::vector<T> operands = interesting_values<T>();
+    for (size_t i = 0; i < operands.size(); ++i) {
+        for (size_t j = 0; j < operands.size(); ++j) {
+            check_scalar_pair(operands[i], operands[j]);
+        }
+    }
+}
+
+// Checks fmod_fast::scalar on operands derived from a row number: `db` is the
+// row scaled by one of ten fixed factors, `in_one` is row * 2e-7 (at most 1
+// for the sampled rows) and `in_ten` is row * 2e-6 (at most 10).
+template <typename T>
+void check_actual_load_distribution() {
+    constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                    6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+    constexpr int64_t last_row = 5'000'000;
+    constexpr int64_t row_stride = 9973;
+    for (const double scale : db_scales) {
+        for (int64_t row = 1; row <= last_row; row += row_stride) {
+            const auto row_value = static_cast<double>(row);
+            const T db = static_cast<T>(row_value * scale);
+            const T in_one = static_cast<T>(row_value * 2e-7);
+            const T in_ten = static_cast<T>(row_value * 2e-6);
+            check_scalar_pair(db, db);
+            check_scalar_pair(in_one, db);
+            check_scalar_pair(db, in_one);
+            check_scalar_pair(db, in_ten);
+        }
+    }
+}
+
+// Cross-checks fmod_fast::scalar against the reference on 20000 rounds of
+// pseudo-random finite operands drawn from a fixed-seed generator, so every
+// run sees the same sequence.
+template <typename T>
+void check_random_finite_distribution() {
+    // 1e-200 lies below float's smallest subnormal, so narrowing such a value
+    // to float always produces +/-0 and the "tiny dividend" check would only
+    // ever test fmod(+/-0, rhs). When 1e-200 is below T's normal range, use a
+    // bound that T can represent instead; double keeps the original range.
+    constexpr double tiny_bound =
+            static_cast<double>(std::numeric_limits<T>::min()) > 1e-200 ? 1e-30 : 1e-200;
+
+    std::mt19937_64 rng(0x9e3779b97f4a7c15ULL);
+    std::uniform_real_distribution<double> large(-4.5e10, 4.5e10);
+    std::uniform_real_distribution<double> small(-10.0, 10.0);
+    std::uniform_real_distribution<double> tiny(-tiny_bound, tiny_bound);
+    for (int i = 0; i < 20000; ++i) {
+        T lhs = static_cast<T>(large(rng));
+        T rhs = static_cast<T>(small(rng));
+        if (rhs == T(0)) {
+            // Keep the divisor non-zero.
+            rhs = static_cast<T>(0.125);
+        }
+        // Large dividend, small divisor.
+        check_scalar_pair(lhs, rhs);
+        // Small dividend, large divisor (1 stands in for a zero divisor).
+        check_scalar_pair(static_cast<T>(small(rng)), lhs == T(0) ? static_cast<T>(1) : lhs);
+        // Tiny dividend, small divisor.
+        check_scalar_pair(static_cast<T>(tiny(rng)), rhs);
+    }
+}
+
+// Appends matching (lhs, rhs) operand pairs to the two vectors: first the full
+// cross product of interesting_values<T>(), then row-derived operands built
+// from ten fixed scale factors.
+template <typename T>
+void fill_batch_inputs(std::vector<T>* lhs, std::vector<T>* rhs) {
+    // Appends one operand pair, keeping both vectors the same length.
+    const auto push_pair = [lhs, rhs](T left, T right) {
+        lhs->push_back(left);
+        rhs->push_back(right);
+    };
+
+    const std::vector<T> corner_cases = interesting_values<T>();
+    for (const T left : corner_cases) {
+        for (const T right : corner_cases) {
+            push_pair(left, right);
+        }
+    }
+
+    constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                    6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+    for (const double scale : db_scales) {
+        for (int64_t row = 1; row <= 5'000'000; row += 1543) {
+            const auto row_value = static_cast<double>(row);
+            const T db = static_cast<T>(row_value * scale);
+            const T in_one = static_cast<T>(row_value * 2e-7);
+            const T in_ten = static_cast<T>(row_value * 2e-6);
+            push_pair(db, db);
+            push_pair(in_one, db);
+            push_pair(db, in_one);
+            push_pair(db, in_ten);
+        }
+    }
+}
+
+// Verifies fmod_fast::vector_vector against the reference, row by row.
+// Specialized per floating-point type.
+template <typename T>
+void check_batch_vector_vector();
+
+// double: a row must be flagged null exactly when its divisor compares equal
+// to zero; the result is then compared against the reference computed with
+// the divisor replaced by divisor + 1 for those rows.
+template <>
+void check_batch_vector_vector<double>() {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    const size_t rows = lhs.size();
+    std::vector<double> result(rows);
+    std::vector<uint8_t> null_map(rows);
+
+    fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), rows);
+    for (size_t row = 0; row < rows; ++row) {
+        const uint8_t expected_null = rhs[row] == 0.0;
+        ASSERT_EQ(expected_null, null_map[row]) << row;
+        const double adjusted_rhs = rhs[row] + static_cast<double>(expected_null);
+        expect_same_double(result[row], reference_fmod(lhs[row], adjusted_rhs), lhs[row],
+                           adjusted_rhs);
+    }
+}
+
+// float: a row must be flagged null exactly when its divisor compares equal
+// to zero; the result is then compared against the reference computed with
+// the divisor replaced by divisor + 1 for those rows.
+template <>
+void check_batch_vector_vector<float>() {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    const size_t rows = lhs.size();
+    std::vector<float> result(rows);
+    std::vector<uint8_t> null_map(rows);
+
+    fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), rows);
+    for (size_t row = 0; row < rows; ++row) {
+        const uint8_t expected_null = rhs[row] == 0.0F;
+        ASSERT_EQ(expected_null, null_map[row]) << row;
+        const float adjusted_rhs = rhs[row] + static_cast<float>(expected_null);
+        expect_same_float(result[row], reference_fmod(lhs[row], adjusted_rhs), lhs[row],
+                          adjusted_rhs);
+    }
+}
+
+// Verifies the batch kernels that take one constant operand
+// (fmod_fast::vector_constant and fmod_fast::constant_vector).
+// Specialized per floating-point type.
+template <typename T>
+void check_batch_const_shapes();
+
+// double specialization. The three phases below reuse the same `result` and
+// `null_map` buffers, so their order matters.
+template <>
+void check_batch_const_shapes<double>() {
+ std::vector<double> lhs;
+ std::vector<double> rhs;
+ fill_batch_inputs(&lhs, &rhs);
+ // Pre-fill the output with a sentinel so untouched slots can be detected.
+ std::vector<double> result(lhs.size(), -777.0);
+ std::vector<uint8_t> null_map(lhs.size());
+
+ // Phase 1: constant zero divisor. Every row must be flagged null and the
+ // output must still hold the sentinel.
+ fmod_fast::vector_constant(lhs.data(), 0.0, result.data(),
null_map.data(), lhs.size());
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ ASSERT_EQ(1, null_map[i]) << i;
+ ASSERT_EQ(bits(-777.0), bits(result[i])) << i;
+ }
+
+ // Phase 2: constant non-zero divisor. No row is null and every result must
+ // match the reference.
+ fmod_fast::vector_constant(lhs.data(), 0.125, result.data(),
null_map.data(), lhs.size());
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ ASSERT_EQ(0, null_map[i]) << i;
+ expect_same_double(result[i], reference_fmod(lhs[i], 0.125), lhs[i],
0.125);
+ }
+
+ // Phase 3: constant dividend, vector divisor. A row is expected to be null
+ // exactly when its divisor compares equal to zero; for those rows the
+ // reference is computed with the divisor bumped by 1.
+ fmod_fast::constant_vector(12345.678, rhs.data(), result.data(),
null_map.data(), rhs.size());
+ for (size_t i = 0; i < rhs.size(); ++i) {
+ uint8_t expected_null = rhs[i] == 0.0;
+ ASSERT_EQ(expected_null, null_map[i]) << i;
+ double adjusted_rhs = rhs[i] + static_cast<double>(expected_null);
+ expect_same_double(result[i], reference_fmod(12345.678, adjusted_rhs),
12345.678,
+ adjusted_rhs);
+ }
+}
+
+// float specialization of the constant-operand batch checks. The three phases
+// below reuse the same `result` and `null_map` buffers, so their order matters.
+template <>
+void check_batch_const_shapes<float>() {
+ std::vector<float> lhs;
+ std::vector<float> rhs;
+ fill_batch_inputs(&lhs, &rhs);
+ // Pre-fill the output with a sentinel so untouched slots can be detected.
+ std::vector<float> result(lhs.size(), -777.0F);
+ std::vector<uint8_t> null_map(lhs.size());
+
+ // Phase 1: constant zero divisor. Every row must be flagged null and the
+ // output must still hold the sentinel.
+ fmod_fast::vector_constant(lhs.data(), 0.0F, result.data(),
null_map.data(), lhs.size());
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ ASSERT_EQ(1, null_map[i]) << i;
+ ASSERT_EQ(bits(-777.0F), bits(result[i])) << i;
+ }
+
+ // Phase 2: constant non-zero divisor. No row is null and every result must
+ // match the reference.
+ fmod_fast::vector_constant(lhs.data(), 0.125F, result.data(),
null_map.data(), lhs.size());
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ ASSERT_EQ(0, null_map[i]) << i;
+ expect_same_float(result[i], reference_fmod(lhs[i], 0.125F), lhs[i],
0.125F);
+ }
+
+ // Phase 3: constant dividend, vector divisor. A row is expected to be null
+ // exactly when its divisor compares equal to zero; for those rows the
+ // reference is computed with the divisor bumped by 1.
+ fmod_fast::constant_vector(12345.678F, rhs.data(), result.data(),
null_map.data(), rhs.size());
+ for (size_t i = 0; i < rhs.size(); ++i) {
+ uint8_t expected_null = rhs[i] == 0.0F;
+ ASSERT_EQ(expected_null, null_map[i]) << i;
+ float adjusted_rhs = rhs[i] + static_cast<float>(expected_null);
+ expect_same_float(result[i], reference_fmod(12345.678F, adjusted_rhs),
12345.678F,
+ adjusted_rhs);
+ }
+}
+
+} // namespace
+
+// Scalar kernel on the cross product of corner-case operands, for both types.
+TEST(FunctionFmodFastTest, ScalarCornerCasesMatchStdFmod) {
+ check_scalar_corner_cases<double>();
+ check_scalar_corner_cases<float>();
+}
+
+// Scalar kernel on row-derived operands, for both types.
+TEST(FunctionFmodFastTest, ActualLoadDistributionMatchesStdFmod) {
+ check_actual_load_distribution<double>();
+ check_actual_load_distribution<float>();
+}
+
+// Scalar kernel on fixed-seed pseudo-random finite operands, for both types.
+TEST(FunctionFmodFastTest, RandomFiniteDistributionMatchesStdFmod) {
+ check_random_finite_distribution<double>();
+ check_random_finite_distribution<float>();
+}
+
+// Batch kernel with two vector operands, for both types.
+TEST(FunctionFmodFastTest, BatchVectorVectorMatchesStdFmod) {
+ check_batch_vector_vector<double>();
+ check_batch_vector_vector<float>();
+}
+
+// Batch kernels with one constant operand, for both types.
+TEST(FunctionFmodFastTest, BatchConstShapesMatchStdFmod) {
+ check_batch_const_shapes<double>();
+ check_batch_const_shapes<float>();
+}
+
+// Runs the same double-typed data set through the "fmod" and "mod" functions:
+// rows with a non-zero divisor expect the std::fmod reference value, rows with
+// a zero divisor expect Null().
+TEST(FunctionFmodFastTest, DorisFunctionNullSemanticsStayUnchanged) {
+ InputTypeSet input_types = {PrimitiveType::TYPE_DOUBLE,
PrimitiveType::TYPE_DOUBLE};
+ DataSet data_set = {
+ {{5.5, 2.0}, reference_fmod(5.5, 2.0)},
+ {{-5.5, 2.0}, reference_fmod(-5.5, 2.0)},
+ {{5.5, -2.0}, reference_fmod(5.5, -2.0)},
+ {{1.0, 0.0}, Null()},
+ {{0.0, 0.0}, Null()},
+ {{44'728'676'500.0, 0.9999998}, reference_fmod(44'728'676'500.0,
0.9999998)}};
+ // The value returned by check_function is deliberately discarded here.
+ // NOTE(review): presumably failures are reported from inside
+ // check_function itself; confirm against function_test_util.h.
+ static_cast<void>(check_function<DataTypeFloat64, true>("fmod",
input_types, data_set));
+ static_cast<void>(check_function<DataTypeFloat64, true>("mod",
input_types, data_set));
+}
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]