This is an automated email from the ASF dual-hosted git repository.

philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new fa12819be [VL] Skip UTF-8 validation in JSON parsing (#6661)
fa12819be is described below

commit fa12819be3cb73493e6f3c035bfbc75261f53550
Author: PHILO-HE <[email protected]>
AuthorDate: Wed Aug 7 17:30:07 2024 +0800

    [VL] Skip UTF-8 validation in JSON parsing (#6661)
---
 .../gluten/execution/ScalarFunctionsValidateSuite.scala    | 14 +++++++++++---
 ep/build-velox/src/modify_velox.patch                      | 11 +++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
index 897c1c5f5..a0f7d22e4 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
@@ -263,20 +263,28 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateTest {
     }
   }
 
-  test("Test get_json_object datatab function") {
+  test("get_json_object") {
     runQueryAndCompare(
       "SELECT get_json_object(string_field1, '$.a') " +
         "from datatab limit 1;") {
       checkGlutenOperatorMatch[ProjectExecTransformer]
     }
-  }
 
-  test("Test get_json_object lineitem function") {
     runQueryAndCompare(
       "SELECT l_orderkey, get_json_object('{\"a\":\"b\"}', '$.a') " +
         "from lineitem limit 1;") {
       checkGlutenOperatorMatch[ProjectExecTransformer]
     }
+
+    // Invalid UTF-8 encoding.
+    spark.sql(
+      "CREATE TABLE t USING parquet SELECT concat('{\"a\": 2, \"'," +
+        " string(X'80'), '\": 3, \"c\": 100}') AS c1")
+    withTable("t") {
+      runQueryAndCompare("SELECT get_json_object(c1, '$.c') FROM t;") {
+        checkGlutenOperatorMatch[ProjectExecTransformer]
+      }
+    }
   }
 
   ignore("json_array_length") {
diff --git a/ep/build-velox/src/modify_velox.patch b/ep/build-velox/src/modify_velox.patch
index 533b49353..c710ff545 100644
--- a/ep/build-velox/src/modify_velox.patch
+++ b/ep/build-velox/src/modify_velox.patch
@@ -180,3 +180,14 @@ index 97266c253..11d88dcc4 100644
  
  add_library(
    velox_dwio_arrow_parquet_writer_test_lib
+diff --git a/CMake/resolve_dependency_modules/simdjson.cmake b/CMake/resolve_dependency_modules/simdjson.cmake
+index 69e7f2044..777eb5ec1 100644
+--- a/CMake/resolve_dependency_modules/simdjson.cmake
++++ b/CMake/resolve_dependency_modules/simdjson.cmake
+@@ -29,4 +29,6 @@ FetchContent_Declare(
+   URL ${VELOX_SIMDJSON_SOURCE_URL}
+   URL_HASH ${VELOX_SIMDJSON_BUILD_SHA256_CHECKSUM})
+
++set(SIMDJSON_SKIPUTF8VALIDATION ON)
++
+ FetchContent_MakeAvailable(simdjson)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to