This is an automated email from the ASF dual-hosted git repository.
liuneng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 5b183fb59 [GLUTEN-5303][CH]Fix get_json_object on abnormal string
contains `NULL` control character (#5304)
5b183fb59 is described below
commit 5b183fb591e9b64a6f76bdd305dc4a55c1efb79a
Author: KevinyhZou <[email protected]>
AuthorDate: Mon Apr 15 17:41:57 2024 +0800
[GLUTEN-5303][CH]Fix get_json_object on abnormal string contains `NULL`
control character (#5304)
What changes were proposed in this pull request?
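Rename parseAbnormalJson to normalizeJson in SparkFunctionGetJsonObject.h. The
helper now skips every ASCII control character (0x00-0x1F, including NUL, and
0x7F) instead of only 0x01-0x1F, and returns the length of the normalized text
written to dst rather than rebinding the caller's string_view. Both call sites
now parse std::string_view(dst, size) when the first parse attempt fails.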
(Fixes: #5303)
How was this patch tested?
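Tested by unit test: GlutenClickHouseHiveTableSuite gains a new query
(select_sql_6) that is compared against vanilla Spark, and a new binary test
data file text-data/abnormal-json/data2.txt is added.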
---
.../resources/text-data/abnormal-json/data2.txt | Bin 0 -> 36 bytes
.../execution/GlutenClickHouseHiveTableSuite.scala | 3 +++
.../Functions/SparkFunctionGetJsonObject.h | 21 +++++++++------------
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git
a/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt
b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt
new file mode 100644
index 000000000..485fb6d72
Binary files /dev/null and
b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt
differ
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
index 1939883c8..0ac64ca44 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
@@ -951,11 +951,14 @@ class GlutenClickHouseHiveTableSuite
val select_sql_3 = "select id, get_json_object(data, '$.123.234') from
test_tbl_3337"
val select_sql_4 = "select id, get_json_object(data, '$.v111') from
test_tbl_3337"
val select_sql_5 = "select id, get_json_object(data, 'v112') from
test_tbl_3337"
+ val select_sql_6 =
+ "select id, get_json_object(data, '$.id') from test_tbl_3337 where id =
123";
compareResultsAgainstVanillaSpark(select_sql_1, compareResult = true, _ =>
{})
compareResultsAgainstVanillaSpark(select_sql_2, compareResult = true, _ =>
{})
compareResultsAgainstVanillaSpark(select_sql_3, compareResult = true, _ =>
{})
compareResultsAgainstVanillaSpark(select_sql_4, compareResult = true, _ =>
{})
compareResultsAgainstVanillaSpark(select_sql_5, compareResult = true, _ =>
{})
+ compareResultsAgainstVanillaSpark(select_sql_6, compareResult = true, _ =>
{})
spark.sql("DROP TABLE test_tbl_3337")
}
diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
index d9411813b..6ba05e901 100644
--- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
+++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
@@ -212,22 +212,20 @@ public:
private:
DB::ContextPtr context;
- void parseAbnormalJson(char * dst, std::string_view & json) const
+ size_t normalizeJson(std::string_view & json, char * dst) const
{
const char * json_chars = json.data();
const size_t json_size = json.size();
- UInt8 NULL_CHAR = 0x0000;
- UInt8 SPACE_CHAR = 0x0020;
std::stack<char> tmp;
- size_t cursor = 0;
+ size_t new_json_size = 0;
for (size_t i = 0; i <= json_size; ++i)
{
- if (*(json_chars + i) > NULL_CHAR && *(json_chars + i) <
SPACE_CHAR)
+ if ((*(json_chars + i) >= 0x00 && *(json_chars + i) <= 0x1F) ||
*(json_chars + i) == 0x7F)
continue;
else
{
char ch = *(json_chars + i);
- dst[cursor++] = ch;
+ dst[new_json_size++] = ch;
if (ch == '{')
tmp.push('{');
else if (ch == '}')
@@ -239,8 +237,7 @@ private:
break;
}
}
- std::string_view result{dst, cursor};
- json = result;
+ return new_json_size;
}
template <typename JSONParser, typename Impl>
@@ -323,8 +320,8 @@ private:
if (!document_ok)
{
char dst[json.size()];
- parseAbnormalJson(dst, json);
- document_ok = parser.parse(json, document);
+ size_t size = normalizeJson(json, dst);
+ document_ok = parser.parse(std::string_view(dst, size),
document);
}
}
@@ -345,8 +342,8 @@ private:
if (!document_ok)
{
char dst[json.size()];
- parseAbnormalJson(dst, json);
- document_ok = parser.parse(json, document);
+ size_t size = normalizeJson(json, dst);
+ document_ok = parser.parse(std::string_view(dst, size),
document);
}
}
if (document_ok)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]