This is an automated email from the ASF dual-hosted git repository.

liuneng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 5b183fb59 [GLUTEN-5303][CH]Fix get_json_object on abnormal string 
contains `NULL` control character (#5304)
5b183fb59 is described below

commit 5b183fb591e9b64a6f76bdd305dc4a55c1efb79a
Author: KevinyhZou <[email protected]>
AuthorDate: Mon Apr 15 17:41:57 2024 +0800

    [GLUTEN-5303][CH]Fix get_json_object on abnormal string contains `NULL` 
control character (#5304)
    
    What changes were proposed in this pull request?
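    The get_json_object fallback for abnormal JSON (parseAbnormalJson, renamed
    to normalizeJson) now drops every ASCII control character (0x00-0x1F and
    0x7F), including NULL, which was previously kept. It returns the length of
    the normalized text, and callers parse the normalized buffer directly
    instead of having the input string_view rewritten.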
    
    (Fixes: #5303)
    
    How was this patch tested?
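    Tested by unit test: GlutenClickHouseHiveTableSuite gains a new
    get_json_object query (select_sql_6), and a new abnormal-json test data
    file (data2.txt) is added.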
---
 .../resources/text-data/abnormal-json/data2.txt    | Bin 0 -> 36 bytes
 .../execution/GlutenClickHouseHiveTableSuite.scala |   3 +++
 .../Functions/SparkFunctionGetJsonObject.h         |  21 +++++++++------------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git 
a/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt 
b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt
new file mode 100644
index 000000000..485fb6d72
Binary files /dev/null and 
b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt 
differ
diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
index 1939883c8..0ac64ca44 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
@@ -951,11 +951,14 @@ class GlutenClickHouseHiveTableSuite
     val select_sql_3 = "select id, get_json_object(data, '$.123.234') from 
test_tbl_3337"
     val select_sql_4 = "select id, get_json_object(data, '$.v111') from 
test_tbl_3337"
     val select_sql_5 = "select id, get_json_object(data, 'v112') from 
test_tbl_3337"
+    val select_sql_6 =
+      "select id, get_json_object(data, '$.id') from test_tbl_3337 where id = 
123";
     compareResultsAgainstVanillaSpark(select_sql_1, compareResult = true, _ => 
{})
     compareResultsAgainstVanillaSpark(select_sql_2, compareResult = true, _ => 
{})
     compareResultsAgainstVanillaSpark(select_sql_3, compareResult = true, _ => 
{})
     compareResultsAgainstVanillaSpark(select_sql_4, compareResult = true, _ => 
{})
     compareResultsAgainstVanillaSpark(select_sql_5, compareResult = true, _ => 
{})
+    compareResultsAgainstVanillaSpark(select_sql_6, compareResult = true, _ => 
{})
 
     spark.sql("DROP TABLE test_tbl_3337")
   }
diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h 
b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
index d9411813b..6ba05e901 100644
--- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
+++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
@@ -212,22 +212,20 @@ public:
 private:
     DB::ContextPtr context;
 
-    void parseAbnormalJson(char * dst, std::string_view & json) const
+    size_t normalizeJson(std::string_view & json, char * dst) const
     {
         const char * json_chars = json.data();
         const size_t json_size = json.size();
-        UInt8 NULL_CHAR = 0x0000;
-        UInt8 SPACE_CHAR = 0x0020;
         std::stack<char> tmp;
-        size_t cursor = 0;
+        size_t new_json_size = 0;
         for (size_t i = 0; i <= json_size; ++i)
         {
-            if (*(json_chars + i) > NULL_CHAR && *(json_chars + i) < 
SPACE_CHAR)
+            if ((*(json_chars + i) >= 0x00 && *(json_chars + i) <= 0x1F) || 
*(json_chars + i) == 0x7F)
                 continue;
             else
             {
                 char ch = *(json_chars + i);
-                dst[cursor++] = ch;
+                dst[new_json_size++] = ch;
                 if (ch == '{')
                     tmp.push('{');
                 else if (ch == '}')
@@ -239,8 +237,7 @@ private:
                     break;
             }
         }
-        std::string_view result{dst, cursor};
-        json = result;
+        return new_json_size;
     }
 
     template <typename JSONParser, typename Impl>
@@ -323,8 +320,8 @@ private:
             if (!document_ok)
             {
                 char dst[json.size()];
-                parseAbnormalJson(dst, json);
-                document_ok = parser.parse(json, document);
+                size_t size = normalizeJson(json, dst);
+                document_ok = parser.parse(std::string_view(dst, size), 
document);
             }
         }
 
@@ -345,8 +342,8 @@ private:
                 if (!document_ok)
                 {
                     char dst[json.size()];
-                    parseAbnormalJson(dst, json);
-                    document_ok = parser.parse(json, document);
+                    size_t size = normalizeJson(json, dst);
+                    document_ok = parser.parse(std::string_view(dst, size), 
document);
                 }
             }
             if (document_ok)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to