This is an automated email from the ASF dual-hosted git repository.
zhanglistar pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 29f05f3ffc fixed: json path contains spaces (#7435)
29f05f3ffc is described below
commit 29f05f3ffcb8c0b21cd4372b4cbbe7642d3c87d7
Author: lgbo <[email protected]>
AuthorDate: Thu Oct 10 15:40:30 2024 +0800
fixed: json path contains spaces (#7435)
---
.../execution/GlutenFunctionValidateSuite.scala | 17 +++
.../Functions/SparkFunctionGetJsonObject.h | 119 ++++++++++++++++++---
2 files changed, 122 insertions(+), 14 deletions(-)
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index b268eb7192..83deb89a16 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -801,4 +801,21 @@ class GlutenFunctionValidateSuite extends
GlutenClickHouseWholeStageTransformerS
|""".stripMargin
runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer])
}
+
+ test("GLUTEN-7426 get_json_object") { // regression test: JSON paths whose field names contain inner or trailing spaces, some starting with digits
+ val sql = """
+ |select
+ |get_json_object(a, '$.a b'),
+ |get_json_object(a, '$.a b '),
+ |get_json_object(a, '$.a b c'),
+ |get_json_object(a, '$.a 1 c'),
+ |get_json_object(a, '$.1 '),
+ |get_json_object(a, '$.1 2'),
+ |get_json_object(a, '$.1 2 c')
+ |from values('{"a b":1}'), ('{"a b ":1}'), ('{"a b c":1}')
+ |, ('{"a 1 c":1}'), ('{"1 ":1}'), ('{"1 2":1}'), ('{"1 2
c":1}')
+ |as data(a)
+ """.stripMargin
+ runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) // presumably compares Gluten results with vanilla Spark and checks the projection operator -- verify against the suite base class
+ }
}
diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
index 125ab0a394..bc0dd9ea9e 100644
--- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
+++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h
@@ -50,11 +50,11 @@ extern const SettingsUInt64 max_parser_backtracks;
}
namespace ErrorCodes
{
- extern const int LOGICAL_ERROR;
- extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
- extern const int ILLEGAL_TYPE_OF_ARGUMENT;
- extern const int ILLEGAL_COLUMN;
- extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+extern const int LOGICAL_ERROR;
+extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
+extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+extern const int ILLEGAL_COLUMN;
+extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
}
namespace local_engine
@@ -215,7 +215,7 @@ private:
/// To use simdjson, we need to convert single quotes to double quotes.
/// FIXME: It will be OK if we just return a leaf value, but it will have
different result for
/// returning a object with strings which are wrapped by single quotes.
- inline static const char * normalizeSingleQuotesString(const char * pos,
const char * end, char *&dst)
+ inline static const char * normalizeSingleQuotesString(const char * pos,
const char * end, char *& dst)
{
if (!isExpectedChar('\'', pos, end)) [[unlikely]]
{
@@ -403,7 +403,7 @@ class GetJsonObjectImpl
public:
using Element = typename JSONParser::Element;
- static DB::DataTypePtr getReturnType(const char *, const
DB::ColumnsWithTypeAndName &, bool )
+ static DB::DataTypePtr getReturnType(const char *, const
DB::ColumnsWithTypeAndName &, bool)
{
auto nested_type = std::make_shared<DB::DataTypeString>();
return std::make_shared<DB::DataTypeNullable>(nested_type);
@@ -411,8 +411,7 @@ public:
static size_t getNumberOfIndexArguments(const DB::ColumnsWithTypeAndName &
arguments) { return arguments.size() - 1; }
- bool insertResultToColumn(
- DB::IColumn & dest, const Element & root,
DB::GeneratorJSONPath<JSONParser> & generator_json_path, bool )
+ bool insertResultToColumn(DB::IColumn & dest, const Element & root,
DB::GeneratorJSONPath<JSONParser> & generator_json_path, bool)
{
Element current_element = root;
DB::VisitorStatus status;
@@ -447,7 +446,7 @@ public:
if (elements[0].isNull())
return false;
nullable_col_str.getNullMapData().push_back(0);
-
+
if (elements[0].isString())
{
auto str = elements[0].getString();
@@ -480,8 +479,99 @@ public:
serializer.commit();
return true;
}
+};
-private:
+/// Rewrites a JSON path so that field names made of bare-word/number tokens (e.g. `$.a b`) are wrapped in double quotes, keeping the spaces between and after the tokens.
+/// FIXME: If a field name contains \t or \n, simdjson cannot parse it.
+class JSONPathNormalizer
+{
+public:
+ static String normalize(const String & json_path_)
+ {
+ DB::Tokens tokens(json_path_.data(), json_path_.data() +
json_path_.size());
+ DB::IParser::Pos pos(tokens, 0, 0);
+ String res;
+ while (pos->type != DB::TokenType::EndOfStream)
+ {
+ if (pos->type == DB::TokenType::Number) // field name that starts with a number token
+ {
+ ++pos; // look at the token that follows the number
+ // Two tokens are separated by white spaces.
+ if (pos->type == DB::TokenType::Number || pos->type ==
DB::TokenType::BareWord)
+ {
+ --pos;
+ if (*pos->begin == '.') // the number token itself begins with a dot: emit the dot outside the quotes
+ res += ".";
+ ++pos;
+ res += "\"";
+
+ while (pos->type == DB::TokenType::Number || pos->type ==
DB::TokenType::BareWord)
+ {
+ --pos; // step back to the token that has not been copied yet
+ const auto * last_end = pos->end;
+ const auto * begin = *pos->begin == '.' ? pos->begin +
1 : pos->begin;
+ res += String(begin, pos->end);
+ ++pos;
+ res += String(last_end, pos->begin); // copy the gap between the two tokens verbatim
+ ++pos;
+ }
+ --pos;
+ const auto * last_end = pos->end;
+ res += String(pos->begin, pos->end); // last token of the run
+ ++pos;
+ res += String(last_end, pos->begin); // text between the last token and the next one stays inside the quotes
+ res += "\"";
+ }
+ else if (
+ pos->type == DB::TokenType::Dot || pos->type ==
DB::TokenType::OpeningSquareBracket
+ || pos->type == DB::TokenType::EndOfStream)
+ {
+ --pos; // single number token used as a field name: quote it on its own
+ if (*pos->begin == '.')
+ res += ".";
+ res += "\"";
+ const auto * last_end = pos->end;
+ const auto * begin = *pos->begin == '.' ? pos->begin + 1 :
pos->begin;
+ res += String(begin, pos->end);
+ ++pos;
+ res += String(last_end, pos->begin);
+ res += "\"";
+ }
+ else
+ {
+ --pos;
+ res += String(pos->begin, pos->end); // number followed by any other token: copied unquoted
+ ++pos;
+ }
+ }
+ else if (pos->type == DB::TokenType::BareWord) // field name that starts with a bare word: always quoted
+ {
+ res += "\"";
+ ++pos;
+ while (pos->type == DB::TokenType::Number || pos->type ==
DB::TokenType::BareWord)
+ {
+ --pos;
+ const auto * last_end = pos->end;
+ res += String(pos->begin, pos->end);
+ ++pos;
+ res += String(last_end, pos->begin); // copy the gap between the two tokens verbatim
+ ++pos;
+ }
+ --pos;
+ const auto * last_end = pos->end;
+ res += String(pos->begin, pos->end);
+ ++pos;
+ res += String(last_end, pos->begin);
+ res += "\"";
+ }
+ else
+ {
+ res += String(pos->begin, pos->end); // every other token is copied unchanged
+ ++pos;
+ }
+ }
+ return res;
+ }
};
/// Flatten a json string into a tuple.
@@ -550,7 +640,7 @@ private:
mutable size_t total_parsed_rows = 0;
mutable size_t total_normalized_rows = 0;
- template<typename JSONParser>
+ template <typename JSONParser>
bool safeParseJson(std::string_view str, JSONParser & parser,
JSONParser::Element & doc) const
{
total_parsed_rows++;
@@ -566,7 +656,7 @@ private:
}
if (!is_doc_ok)
{
- total_normalized_rows ++;
+ total_normalized_rows++;
std::vector<char> buf;
buf.resize(str.size(), 0);
char * buf_pos = buf.data();
@@ -598,7 +688,8 @@ private:
bool path_parsed = true;
for (const auto & field : tokenizer)
{
- required_fields.push_back(field);
+ auto normalized_field = JSONPathNormalizer::normalize(field);
+ required_fields.push_back(normalized_field);
tuple_columns.emplace_back(str_type->createColumn());
const char * query_begin = reinterpret_cast<const char
*>(required_fields.back().c_str());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]