This is an automated email from the ASF dual-hosted git repository.
yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d5fa41efe2b [SPARK-41741][SQL] Encode the string using the UTF_8
charset in ParquetFilters
d5fa41efe2b is described below
commit d5fa41efe2b1aa0aa41f558c1bef048b4632cf5c
Author: Yuming Wang <[email protected]>
AuthorDate: Mon Feb 20 19:15:30 2023 +0800
[SPARK-41741][SQL] Encode the string using the UTF_8 charset in
ParquetFilters
### What changes were proposed in this pull request?
This PR makes `ParquetFilters` encode the string prefix using the `UTF_8`
charset explicitly, instead of relying on the JVM's default charset.
### Why are the changes needed?
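Fix a data issue that occurs when the JVM's default charset is not `UTF_8`: the prefix bytes produced by `getBytes` with no charset argument would then differ from the UTF-8 encoded bytes they are compared against.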
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Manual test.
Closes #40090 from wangyum/SPARK-41741.
Authored-by: Yuming Wang <[email protected]>
Signed-off-by: Yuming Wang <[email protected]>
---
.../spark/sql/execution/datasources/parquet/ParquetFilters.scala | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
index c34f2827659..6994e1ba39d 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet
import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat,
Long => JLong}
import java.math.{BigDecimal => JBigDecimal}
+import java.nio.charset.StandardCharsets.UTF_8
import java.sql.{Date, Timestamp}
import java.time.{Duration, Instant, LocalDate, Period}
import java.util.HashSet
@@ -776,7 +777,7 @@ class ParquetFilters(
Option(prefix).map { v =>
FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
new UserDefinedPredicate[Binary] with Serializable {
- private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
+ private val strToBinary =
Binary.fromReusedByteArray(v.getBytes(UTF_8))
private val size = strToBinary.length
override def canDrop(statistics: Statistics[Binary]): Boolean = {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]