This is an automated email from the ASF dual-hosted git repository.
yuanzhou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new b0b7ae98c [GLUTEN-2874][VL] support allowDecimalPrecisionLoss (#2895)
b0b7ae98c is described below
commit b0b7ae98c45c6df69ee7024b4047eec7b6e7266f
Author: Yuan <[email protected]>
AuthorDate: Fri Jul 5 16:25:37 2024 +0800
[GLUTEN-2874][VL] support allowDecimalPrecisionLoss (#2895)
This patch allows user to enable or disable decimal precision loss in
decimal arithmetic
---
.../gluten/backendsapi/velox/VeloxBackend.scala | 2 +-
cpp/core/config/GlutenConfig.h | 2 ++
cpp/velox/compute/WholeStageResultIterator.cc | 4 +++
ep/build-velox/src/get_velox.sh | 1 +
.../gluten/utils/DecimalArithmeticUtil.scala | 35 +++++++++++++++++++---
.../scala/org/apache/gluten/GlutenConfig.scala | 2 ++
6 files changed, 41 insertions(+), 5 deletions(-)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
index 0238508d9..9c1089a35 100644
---
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
+++
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
@@ -491,7 +491,7 @@ object VeloxBackendSettings extends BackendSettingsApi {
override def supportTransformWriteFiles: Boolean = true
- override def allowDecimalArithmetic: Boolean =
SQLConf.get.decimalOperationsAllowPrecisionLoss
+ override def allowDecimalArithmetic: Boolean = true
override def enableNativeWriteFiles(): Boolean = {
GlutenConfig.getConf.enableNativeWriter.getOrElse(
diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h
index ad7dacf11..060bbe111 100644
--- a/cpp/core/config/GlutenConfig.h
+++ b/cpp/core/config/GlutenConfig.h
@@ -32,6 +32,8 @@ const std::string kCaseSensitive = "spark.sql.caseSensitive";
const std::string kSessionTimezone = "spark.sql.session.timeZone";
+const std::string kAllowPrecisionLoss =
"spark.sql.decimalOperations.allowPrecisionLoss";
+
const std::string kIgnoreMissingFiles = "spark.sql.files.ignoreMissingFiles";
const std::string kDefaultSessionTimezone =
"spark.gluten.sql.session.timeZone.default";
diff --git a/cpp/velox/compute/WholeStageResultIterator.cc
b/cpp/velox/compute/WholeStageResultIterator.cc
index 296b9415b..8439545ca 100644
--- a/cpp/velox/compute/WholeStageResultIterator.cc
+++ b/cpp/velox/compute/WholeStageResultIterator.cc
@@ -455,6 +455,10 @@ std::unordered_map<std::string, std::string>
WholeStageResultIterator::getQueryC
// Adjust timestamp according to the above configured session timezone.
configs[velox::core::QueryConfig::kAdjustTimestampToTimezone] = "true";
+ // To align with Spark's behavior, allow decimal precision loss or not.
+
configs[velox::core::QueryConfig::kSparkDecimalOperationsAllowPrecisionLoss] =
+ veloxCfg_->get<std::string>(kAllowPrecisionLoss, "true");
+
{
// partial aggregation memory config
auto offHeapMemory = veloxCfg_->get<int64_t>(kSparkTaskOffHeapMemory,
facebook::velox::memory::kMaxMemory);
diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh
index 0ef03bd83..3e99229ad 100755
--- a/ep/build-velox/src/get_velox.sh
+++ b/ep/build-velox/src/get_velox.sh
@@ -16,6 +16,7 @@
set -exu
+
VELOX_REPO=https://github.com/oap-project/velox.git
VELOX_BRANCH=2024_07_05
VELOX_HOME=""
diff --git
a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala
b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala
index 479eb8bb5..d3a3373aa 100644
---
a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala
+++
b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala
@@ -28,8 +28,14 @@ import org.apache.spark.sql.utils.DecimalTypeUtil
object DecimalArithmeticUtil {
+ val MIN_ADJUSTED_SCALE = 6
+ val MAX_PRECISION = 38
+ val MAX_SCALE = 38
+
// Returns the result decimal type of a decimal arithmetic computing.
def getResultType(expr: BinaryArithmetic, type1: DecimalType, type2:
DecimalType): DecimalType = {
+
+ val allowPrecisionLoss = SQLConf.get.decimalOperationsAllowPrecisionLoss
var resultScale = 0
var resultPrecision = 0
expr match {
@@ -45,13 +51,34 @@ object DecimalArithmeticUtil {
resultScale = type1.scale + type2.scale
resultPrecision = type1.precision + type2.precision + 1
case _: Divide =>
- resultScale =
- Math.max(DecimalType.MINIMUM_ADJUSTED_SCALE, type1.scale +
type2.precision + 1)
- resultPrecision = type1.precision - type1.scale + type2.scale +
resultScale
+ if (allowPrecisionLoss) {
+ resultScale = Math.max(MIN_ADJUSTED_SCALE, type1.scale +
type2.precision + 1)
+ resultPrecision = type1.precision - type1.scale + type2.scale +
resultScale
+ } else {
+ var intDig = Math.min(MAX_SCALE, type1.precision - type1.scale +
type2.scale)
+ var decDig = Math.min(MAX_SCALE, Math.max(6, type1.scale +
type2.precision + 1))
+ val diff = (intDig + decDig) - MAX_SCALE
+ if (diff > 0) {
+ decDig -= diff / 2 + 1
+ intDig = MAX_SCALE - decDig
+ }
+ resultPrecision = intDig + decDig
+ resultScale = decDig
+ }
case other =>
throw new GlutenNotSupportException(s"$other is not supported.")
}
- DecimalTypeUtil.adjustPrecisionScale(resultPrecision, resultScale)
+
+ if (allowPrecisionLoss) {
+ DecimalTypeUtil.adjustPrecisionScale(resultPrecision, resultScale)
+ } else {
+ bounded(resultPrecision, resultScale)
+ }
+
+ }
+
+ def bounded(precision: Int, scale: Int): DecimalType = {
+ DecimalType(Math.min(precision, MAX_PRECISION), Math.min(scale, MAX_SCALE))
}
// If casting between DecimalType, unnecessary cast is skipped to avoid data
loss,
diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
index 4ef96bec2..ef4618fca 100644
--- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
@@ -636,6 +636,7 @@ object GlutenConfig {
GLUTEN_DEFAULT_SESSION_TIMEZONE_KEY,
SQLConf.LEGACY_SIZE_OF_NULL.key,
"spark.io.compression.codec",
+ "spark.sql.decimalOperations.allowPrecisionLoss",
COLUMNAR_VELOX_BLOOM_FILTER_EXPECTED_NUM_ITEMS.key,
COLUMNAR_VELOX_BLOOM_FILTER_NUM_BITS.key,
COLUMNAR_VELOX_BLOOM_FILTER_MAX_NUM_BITS.key,
@@ -717,6 +718,7 @@ object GlutenConfig {
("spark.hadoop.input.write.timeout", "180000"),
("spark.hadoop.dfs.client.log.severity", "INFO"),
("spark.sql.orc.compression.codec", "snappy"),
+ ("spark.sql.decimalOperations.allowPrecisionLoss", "true"),
(
COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key,
COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.defaultValueString),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]