This is an automated email from the ASF dual-hosted git repository.
dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 53ce60457 IMPALA-11778: Printing maps may produce invalid json
53ce60457 is described below
commit 53ce60457b9d24fa4a7518a37d8bb9f1791d5825
Author: Daniel Becker <[email protected]>
AuthorDate: Wed Dec 14 14:50:00 2022 +0100
IMPALA-11778: Printing maps may produce invalid json
Impala allows non-string types, for example numbers, to be keys in maps.
We print maps as json objects, but json objects only allow string keys.
If the Impala map has for example an INT key, the printed json is
invalid.
For example, in Impala the following two maps are not the same:
{1: "a", 2: "b"}
{"1": "a", "2": "b"}
The first map has INT keys, the second has STRING keys. Only the second
one is valid json.
Hive has the same behaviour as Impala, i.e. it produces invalid json if
the map keys have a non-string type.
This change introduces the STRINGIFY_MAP_KEYS query option that, when
set to true, converts non-string keys to strings. The default value of
the new query option is false because
- conversion to string causes loss of information and
- setting it to true would be a breaking change.
Testing:
- Added tests in nested-map-in-select-list.test and map_null_keys.test
that check the behaviour when STRINGIFY_MAP_KEYS is set to true.
Change-Id: I1820036a1c614c34ae5d70ac4fe79a992c9bce3a
Reviewed-on: http://gerrit.cloudera.org:8080/19364
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/runtime/complex-value-writer.h | 50 ++++++-----
be/src/runtime/complex-value-writer.inline.h | 99 +++++++++++++---------
be/src/service/hs2-util.cc | 24 ++++--
be/src/service/hs2-util.h | 5 +-
be/src/service/impala-beeswax-server.cc | 3 +-
be/src/service/impala-hs2-server.cc | 6 +-
be/src/service/query-options.cc | 4 +
be/src/service/query-options.h | 6 +-
be/src/service/query-result-set.cc | 43 +++++++---
be/src/service/query-result-set.h | 16 ++--
common/thrift/ImpalaService.thrift | 3 +
common/thrift/Query.thrift | 3 +
.../queries/QueryTest/map_null_keys.test | 24 ++++++
.../QueryTest/nested-map-in-select-list.test | 23 +++++
14 files changed, 210 insertions(+), 99 deletions(-)
diff --git a/be/src/runtime/complex-value-writer.h
b/be/src/runtime/complex-value-writer.h
index 0de297429..e1c7d05fc 100644
--- a/be/src/runtime/complex-value-writer.h
+++ b/be/src/runtime/complex-value-writer.h
@@ -30,34 +30,40 @@
namespace impala {
-// Class with static methods that write complex types (collections and
structs) in JSON
-// format.
+// Class that writes complex types (collections and structs) in the JSON
format.
template <class JsonStream>
class ComplexValueWriter {
public:
- // Gets a non-null CollectionValue and writes it in JSON format using
'writer'.
- // 'collection_type' should be either TYPE_ARRAY or TYPE_MAP.
- static void CollectionValueToJSON(const CollectionValue& collection_value,
- PrimitiveType collection_type, const TupleDescriptor* item_tuple_desc,
- rapidjson::Writer<JsonStream>* writer);
-
- // Gets a non-null StructVal and writes it in JSON format using 'writer'.
Uses
- // 'column_type' to figure out field names and types. This function can call
itself
- // recursively in case of nested structs.
- static void StructValToJSON(const impala_udf::StructVal& struct_val,
- const ColumnType& column_type, rapidjson::Writer<JsonStream>* writer);
+ // Will use 'writer' to write JSON text. Does not take ownership of
'writer', the
+ // caller is responsible for managing its lifetime. If 'stringify_map_keys'
is true,
+ // converts map keys to strings; see IMPALA-11778.
+ ComplexValueWriter(rapidjson::Writer<JsonStream>* writer, bool
stringify_map_keys);
+
+ // Gets a non-null CollectionValue and writes it in JSON format.
'collection_type'
+ // should be either TYPE_ARRAY or TYPE_MAP.
+ void CollectionValueToJSON(const CollectionValue& collection_value,
+ PrimitiveType collection_type, const TupleDescriptor* item_tuple_desc);
+
+ // Gets a non-null StructVal and writes it in JSON format. Uses
'column_type' to figure
+ // out field names and types. This function can call itself recursively in
case of
+ // nested structs.
+ void StructValToJSON(const impala_udf::StructVal& struct_val,
+ const ColumnType& column_type);
private:
- static void PrimitiveValueToJSON(void* value, const ColumnType& type, bool
map_key,
- rapidjson::Writer<JsonStream>* writer);
- static void WriteNull(rapidjson::Writer<JsonStream>* writer);
- static void CollectionElementToJSON(Tuple* item_tuple, const SlotDescriptor&
slot_desc,
- bool map_key, rapidjson::Writer<JsonStream>* writer);
- static void ArrayValueToJSON(const CollectionValue& array_value,
- const TupleDescriptor* item_tuple_desc, rapidjson::Writer<JsonStream>*
writer);
- static void MapValueToJSON(const CollectionValue& map_value,
- const TupleDescriptor* item_tuple_desc, rapidjson::Writer<JsonStream>*
writer);
+ void PrimitiveValueToJSON(void* value, const ColumnType& type, bool map_key);
+ void WriteNull(bool map_key);
+ void CollectionElementToJSON(Tuple* item_tuple, const SlotDescriptor&
slot_desc,
+ bool map_key);
+ void ArrayValueToJSON(const CollectionValue& array_value,
+ const TupleDescriptor* item_tuple_desc);
+ void MapValueToJSON(const CollectionValue& map_value,
+ const TupleDescriptor* item_tuple_desc);
+
+ rapidjson::Writer<JsonStream>* const writer_;
+ // If true, converts map keys to strings; see IMPALA-11778.
+ const bool stringify_map_keys_;
};
} // namespace impala
diff --git a/be/src/runtime/complex-value-writer.inline.h
b/be/src/runtime/complex-value-writer.inline.h
index 52625bf27..e73a17b60 100644
--- a/be/src/runtime/complex-value-writer.inline.h
+++ b/be/src/runtime/complex-value-writer.inline.h
@@ -33,16 +33,21 @@ bool IsPrimitiveTypePrintedAsString(const ColumnType& type)
{
} // anonymous namespace
-// Writes a primitive value using 'writer'. If 'map_key' is true, treats the
value as a
+// Writes a primitive value. If 'map_key' is true, treats the value as a
// map key. Maps are printed as JSON objects, but in Impala maps can have
non-string keys
-// while JSON objects can't. We circumvent this rule by writing raw values
with the
-// 'rapidjson::kStringType' type when printing map keys. These non-string key
values will
-// not have quotes around them in the printed text.
+// while JSON objects can't. We circumvent this rule by writing values in the
following
+// way:
+// - if 'stringify_map_keys_' is false, we write raw values with the
+// 'rapidjson::kStringType' type when printing map keys; these non-string
key values
+// will not have quotes around them in the printed text.
+// - if 'stringify_map_keys_' is true, we convert the map keys to strings and
write them
+// with quotes.
+// If 'map_key' is false, 'stringify_map_keys_' has no effect.
template <class JsonStream>
void ComplexValueWriter<JsonStream>::PrimitiveValueToJSON(void* value,
- const ColumnType& type, bool map_key, rapidjson::Writer<JsonStream>*
writer) {
+ const ColumnType& type, bool map_key) {
if (type.IsBooleanType() && !map_key) {
- writer->Bool( *(reinterpret_cast<bool*>(value)) );
+ writer_->Bool( *(reinterpret_cast<bool*>(value)) );
return;
}
@@ -51,34 +56,40 @@ void
ComplexValueWriter<JsonStream>::PrimitiveValueToJSON(void* value,
// (used when printing floating point values).
constexpr int scale = -1;
RawValue::PrintValue(value, type, scale, &tmp);
- if (IsPrimitiveTypePrintedAsString(type)) {
- writer->String(tmp.c_str());
+ const bool should_convert_to_string = map_key && stringify_map_keys_;
+ if (IsPrimitiveTypePrintedAsString(type) || should_convert_to_string) {
+ writer_->String(tmp.c_str());
} else {
- writer->RawValue(tmp.c_str(), tmp.size(),
+ writer_->RawValue(tmp.c_str(), tmp.size(),
map_key ? rapidjson::kStringType : rapidjson::kNumberType);
}
}
template <class JsonStream>
-void ComplexValueWriter<JsonStream>::WriteNull(rapidjson::Writer<JsonStream>*
writer) {
- // We don't use writer->Null() because that does not work for map keys. Maps
are
+void ComplexValueWriter<JsonStream>::WriteNull(bool map_key) {
+ // We don't use writer_->Null() because that does not work for map keys.
Maps are
// modelled as JSON objects but JSON objects can only have string keys, not
nulls. We
- // circumvent this limitation by writing a raw value.
+ // circumvent this limitation by writing the string "null" with quotes if
+ // 'stringify_map_keys_' is true and a raw null value without quotes if it
is false.
constexpr char const* null_str = RawValue::NullLiteral(/*top_level*/ false);
- const int null_str_len = std::char_traits<char>::length(null_str);
- writer->RawValue(null_str, null_str_len, rapidjson::kStringType);
+
+ if (map_key && stringify_map_keys_) {
+ writer_->String(null_str);
+ } else {
+ const int null_str_len = std::char_traits<char>::length(null_str);
+ writer_->RawValue(null_str, null_str_len, rapidjson::kStringType);
+ }
}
template <class JsonStream>
void ComplexValueWriter<JsonStream>::CollectionElementToJSON(Tuple* item_tuple,
- const SlotDescriptor& slot_desc, bool map_key,
- rapidjson::Writer<JsonStream>* writer) {
+ const SlotDescriptor& slot_desc, bool map_key) {
void* element = item_tuple->GetSlot(slot_desc.tuple_offset());
DCHECK(element != nullptr);
const ColumnType& element_type = slot_desc.type();
bool element_is_null = item_tuple->IsNull(slot_desc.null_indicator_offset());
if (element_is_null) {
- WriteNull(writer);
+ WriteNull(map_key);
} else if (element_type.IsStructType()) {
DCHECK(false) << "Structs in collections are not supported yet.";
} else if (element_type.IsCollectionType()) {
@@ -88,17 +99,16 @@ void
ComplexValueWriter<JsonStream>::CollectionElementToJSON(Tuple* item_tuple,
slot_desc.children_tuple_descriptor();
DCHECK(child_item_tuple_desc != nullptr);
ComplexValueWriter::CollectionValueToJSON(*nested_collection_val,
element_type.type,
- child_item_tuple_desc, writer);
+ child_item_tuple_desc);
} else {
- PrimitiveValueToJSON(element, element_type, map_key, writer);
+ PrimitiveValueToJSON(element, element_type, map_key);
}
}
-// Gets an ArrayValue and writes it in JSON format using 'writer'.
+// Gets an array value and writes it in JSON format.
template <class JsonStream>
void ComplexValueWriter<JsonStream>::ArrayValueToJSON(const CollectionValue&
array_value,
- const TupleDescriptor* item_tuple_desc,
- rapidjson::Writer<JsonStream>* writer) {
+ const TupleDescriptor* item_tuple_desc) {
DCHECK(item_tuple_desc != nullptr);
const int item_byte_size = item_tuple_desc->byte_size();
@@ -107,23 +117,22 @@ void
ComplexValueWriter<JsonStream>::ArrayValueToJSON(const CollectionValue& arr
DCHECK(slot_descs.size() == 1);
DCHECK(slot_descs[0] != nullptr);
- writer->StartArray();
+ writer_->StartArray();
for (int i = 0; i < array_value.num_tuples; ++i) {
Tuple* item_tuple = reinterpret_cast<Tuple*>(
array_value.ptr + i * item_byte_size);
- // Print the value.
const SlotDescriptor& value_slot_desc = *slot_descs[0];
- CollectionElementToJSON(item_tuple, value_slot_desc, false, writer);
+ CollectionElementToJSON(item_tuple, value_slot_desc, false);
}
- writer->EndArray();
+ writer_->EndArray();
}
-// Gets a MapValue and writes it in JSON format using 'writer'.
+// Gets a map value and writes it in JSON format.
template <class JsonStream>
void ComplexValueWriter<JsonStream>::MapValueToJSON(const CollectionValue&
map_value,
- const TupleDescriptor* item_tuple_desc, rapidjson::Writer<JsonStream>*
writer) {
+ const TupleDescriptor* item_tuple_desc) {
DCHECK(item_tuple_desc != nullptr);
const int item_byte_size = item_tuple_desc->byte_size();
@@ -133,54 +142,60 @@ void ComplexValueWriter<JsonStream>::MapValueToJSON(const
CollectionValue& map_v
DCHECK(slot_descs[0] != nullptr);
DCHECK(slot_descs[1] != nullptr);
- writer->StartObject();
+ writer_->StartObject();
for (int i = 0; i < map_value.num_tuples; ++i) {
Tuple* item_tuple = reinterpret_cast<Tuple*>(
map_value.ptr + i * item_byte_size);
// Print key.
const SlotDescriptor& key_slot_desc = *slot_descs[0];
- CollectionElementToJSON(item_tuple, key_slot_desc, true, writer);
+ CollectionElementToJSON(item_tuple, key_slot_desc, true);
// Print the value.
const SlotDescriptor& value_slot_desc = *slot_descs[1];
- CollectionElementToJSON(item_tuple, value_slot_desc, false, writer);
+ CollectionElementToJSON(item_tuple, value_slot_desc, false);
}
- writer->EndObject();
+ writer_->EndObject();
}
+template <class JsonStream>
+ComplexValueWriter<JsonStream>::ComplexValueWriter(rapidjson::Writer<JsonStream>*
writer,
+ bool stringify_map_keys)
+ : writer_(writer),
+ stringify_map_keys_(stringify_map_keys) {}
+
template <class JsonStream>
void ComplexValueWriter<JsonStream>::CollectionValueToJSON(
const CollectionValue& collection_value, PrimitiveType collection_type,
- const TupleDescriptor* item_tuple_desc, rapidjson::Writer<JsonStream>*
writer) {
+ const TupleDescriptor* item_tuple_desc) {
if (collection_type == PrimitiveType::TYPE_MAP) {
- MapValueToJSON(collection_value, item_tuple_desc, writer);
+ MapValueToJSON(collection_value, item_tuple_desc);
} else {
DCHECK_EQ(collection_type, PrimitiveType::TYPE_ARRAY);
- ArrayValueToJSON(collection_value, item_tuple_desc, writer);
+ ArrayValueToJSON(collection_value, item_tuple_desc);
}
}
template <class JsonStream>
void ComplexValueWriter<JsonStream>::StructValToJSON(const StructVal&
struct_val,
- const ColumnType& column_type, rapidjson::Writer<JsonStream>* writer) {
+ const ColumnType& column_type) {
DCHECK(column_type.type == TYPE_STRUCT);
DCHECK_EQ(struct_val.num_children, column_type.children.size());
- writer->StartObject();
+ writer_->StartObject();
for (int i = 0; i < struct_val.num_children; ++i) {
- writer->String(column_type.field_names[i].c_str());
+ writer_->String(column_type.field_names[i].c_str());
void* child = (void*)(struct_val.ptr[i]);
const ColumnType& child_type = column_type.children[i];
if (child == nullptr) {
- WriteNull(writer);
+ WriteNull(false);
} else if (child_type.IsStructType()) {
- StructValToJSON(*((StructVal*)child), child_type, writer);
+ StructValToJSON(*((StructVal*)child), child_type);
} else {
- PrimitiveValueToJSON(child, child_type, false, writer);
+ PrimitiveValueToJSON(child, child_type, false);
}
}
- writer->EndObject();
+ writer_->EndObject();
}
} // namespace impala
diff --git a/be/src/service/hs2-util.cc b/be/src/service/hs2-util.cc
index be9a0f233..b3255d545 100644
--- a/be/src/service/hs2-util.cc
+++ b/be/src/service/hs2-util.cc
@@ -387,8 +387,11 @@ static void
StructExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
buffer.Clear();
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
- ComplexValueWriter<rapidjson::StringBuffer>::StructValToJSON(
- struct_val, column_type, &writer);
+ // TODO: Create a stringify_map_keys parameter and pass that when
+ // "IMPALA-9551: Allow mixed complex types in select list" is done.
+ ComplexValueWriter<rapidjson::StringBuffer>
complex_value_writer(&writer, false);
+ complex_value_writer.StructValToJSON(struct_val, column_type);
+
column->stringVal.values.emplace_back(buffer.GetString());
}
SetNullBit(output_row_idx, struct_val.is_null, &column->stringVal.nulls);
@@ -398,7 +401,8 @@ static void
StructExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
static void CollectionExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
const TColumnType& type, RowBatch* batch, int start_idx, int num_rows,
- uint32_t output_row_idx, apache::hive::service::cli::thrift::TColumn*
column) {
+ uint32_t output_row_idx, bool stringify_map_keys,
+ apache::hive::service::cli::thrift::TColumn* column) {
DCHECK(type.types.size() > 1);
TTypeNodeType::type coll_thrift_type = type.types[0].type;
DCHECK(coll_thrift_type == TTypeNodeType::ARRAY ||
@@ -424,8 +428,11 @@ static void
CollectionExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
buffer.Clear();
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
- ComplexValueWriter<rapidjson::StringBuffer>::CollectionValueToJSON(
- value, coll_impala_type, item_tuple_desc, &writer);
+ ComplexValueWriter<rapidjson::StringBuffer> complex_value_writer(
+ &writer, stringify_map_keys);
+ complex_value_writer.CollectionValueToJSON(value, coll_impala_type,
+ item_tuple_desc);
+
column->stringVal.values.emplace_back(buffer.GetString());
}
SetNullBit(output_row_idx, coll_val.is_null, &column->stringVal.nulls);
@@ -436,7 +443,8 @@ static void
CollectionExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
// For V6 and above
void impala::ExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
const TColumnType& type, RowBatch* batch, int start_idx, int num_rows,
- uint32_t output_row_idx, apache::hive::service::cli::thrift::TColumn*
column) {
+ uint32_t output_row_idx, bool stringify_map_keys,
+ apache::hive::service::cli::thrift::TColumn* column) {
// Dispatch to a templated function for the loop over rows. This avoids
branching on
// the type for every row.
// TODO: instead of relying on stamped out implementations, we could codegen
this loop
@@ -448,8 +456,8 @@ void impala::ExprValuesToHS2TColumn(ScalarExprEvaluator*
expr_eval,
return;
case TTypeNodeType::ARRAY:
case TTypeNodeType::MAP:
- CollectionExprValuesToHS2TColumn(
- expr_eval, type, batch, start_idx, num_rows, output_row_idx, column);
+ CollectionExprValuesToHS2TColumn(expr_eval, type, batch, start_idx,
num_rows,
+ output_row_idx, stringify_map_keys, column);
return;
default:
break;
diff --git a/be/src/service/hs2-util.h b/be/src/service/hs2-util.h
index b6a97b09a..50618e4a6 100644
--- a/be/src/service/hs2-util.h
+++ b/be/src/service/hs2-util.h
@@ -35,11 +35,12 @@ void TColumnValueToHS2TColumn(const TColumnValue& col_val,
const TColumnType& ty
/// Evaluate 'expr_eval' over the row [start_idx, start_idx + num_rows) from
'batch' into
/// 'column' with 'type' starting at output_row_idx. The caller is responsible
for
-/// calling RuntimeState::GetQueryStatus() to check for expression evaluation
errors.
+/// calling RuntimeState::GetQueryStatus() to check for expression evaluation
errors. If
+/// 'stringify_map_keys' is true, converts map keys to strings; see
IMPALA-11778.
/// For V6->
void ExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, const TColumnType&
type,
RowBatch* batch, int start_idx, int num_rows, uint32_t output_row_idx,
- apache::hive::service::cli::thrift::TColumn* column);
+ bool stringify_map_keys, apache::hive::service::cli::thrift::TColumn*
column);
/// For V1->V5
void TColumnValueToHS2TColumnValue(const TColumnValue& col_val, const
TColumnType& type,
diff --git a/be/src/service/impala-beeswax-server.cc
b/be/src/service/impala-beeswax-server.cc
index da3c05a98..2b10e8c17 100644
--- a/be/src/service/impala-beeswax-server.cc
+++ b/be/src/service/impala-beeswax-server.cc
@@ -624,7 +624,8 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id,
const bool start_over,
query_results->data.clear();
if (!query_handle->eos()) {
scoped_ptr<QueryResultSet>
result_set(QueryResultSet::CreateAsciiQueryResultSet(
- *query_handle->result_metadata(), &query_results->data));
+ *query_handle->result_metadata(), &query_results->data,
+ query_handle->query_options().stringify_map_keys));
fetch_rows_status =
query_handle->FetchRows(fetch_size, result_set.get(),
block_on_wait_time_us);
}
diff --git a/be/src/service/impala-hs2-server.cc
b/be/src/service/impala-hs2-server.cc
index 7f9abc34c..e0164e862 100644
--- a/be/src/service/impala-hs2-server.cc
+++ b/be/src/service/impala-hs2-server.cc
@@ -219,7 +219,8 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id,
SessionState* session,
TProtocolVersion::type version = is_child_query ?
TProtocolVersion::HIVE_CLI_SERVICE_PROTOCOL_V1 : session->hs2_version;
scoped_ptr<QueryResultSet> result_set(QueryResultSet::CreateHS2ResultSet(
- version, *(query_handle->result_metadata()), &(fetch_results->results)));
+ version, *(query_handle->result_metadata()), &(fetch_results->results),
+ query_handle->query_options().stringify_map_keys));
RETURN_IF_ERROR(
query_handle->FetchRows(fetch_size, result_set.get(),
block_on_wait_time_us));
*num_results = result_set->size();
@@ -541,7 +542,8 @@ Status ImpalaServer::SetupResultsCacheing(const
QueryHandle& query_handle,
if (cache_num_rows > 0) {
const TResultSetMetadata* result_set_md = query_handle->result_metadata();
QueryResultSet* result_set =
- QueryResultSet::CreateHS2ResultSet(session->hs2_version,
*result_set_md, nullptr);
+ QueryResultSet::CreateHS2ResultSet(session->hs2_version,
*result_set_md, nullptr,
+ query_handle->query_options().stringify_map_keys);
RETURN_IF_ERROR(query_handle->SetResultCache(result_set, cache_num_rows));
}
return Status::OK();
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index a661096c4..4579a9612 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1052,6 +1052,10 @@ Status impala::SetQueryOption(const string& key, const
string& value,
query_options->__set_fallback_db_for_functions(value);
break;
}
+ case TImpalaQueryOptions::STRINGIFY_MAP_KEYS: {
+ query_options->__set_stringify_map_keys(IsTrue(value));
+ break;
+ }
default:
if (IsRemovedQueryOption(key)) {
LOG(WARNING) << "Ignoring attempt to set removed query option '" <<
key << "'";
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 4f5330435..d21038491 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -50,7 +50,7 @@ typedef std::unordered_map<string,
beeswax::TQueryOptionLevel::type>
// time we add or remove a query option to/from the enum TImpalaQueryOptions.
#define QUERY_OPTS_TABLE
\
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),
\
- TImpalaQueryOptions::CODEGEN_CACHE_MODE + 1);
\
+ TImpalaQueryOptions::STRINGIFY_MAP_KEYS + 1);
\
REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded,
ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)
\
REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)
\
@@ -280,7 +280,9 @@ typedef std::unordered_map<string,
beeswax::TQueryOptionLevel::type>
fallback_db_for_functions, FALLBACK_DB_FOR_FUNCTIONS,
TQueryOptionLevel::ADVANCED) \
QUERY_OPT_FN(
\
disable_codegen_cache, DISABLE_CODEGEN_CACHE,
TQueryOptionLevel::ADVANCED) \
- QUERY_OPT_FN(codegen_cache_mode, CODEGEN_CACHE_MODE,
TQueryOptionLevel::DEVELOPMENT);
+ QUERY_OPT_FN(codegen_cache_mode, CODEGEN_CACHE_MODE,
TQueryOptionLevel::DEVELOPMENT) \
+ QUERY_OPT_FN(stringify_map_keys, STRINGIFY_MAP_KEYS,
TQueryOptionLevel::ADVANCED) \
+;
/// Enforce practical limits on some query options to avoid undesired query
state.
static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB
diff --git a/be/src/service/query-result-set.cc
b/be/src/service/query-result-set.cc
index 17f37c208..e90efe7ea 100644
--- a/be/src/service/query-result-set.cc
+++ b/be/src/service/query-result-set.cc
@@ -58,8 +58,9 @@ namespace impala {
class AsciiQueryResultSet : public QueryResultSet {
public:
/// Rows are added into 'rowset'.
- AsciiQueryResultSet(const TResultSetMetadata& metadata, vector<string>*
rowset)
- : metadata_(metadata), result_set_(rowset) {}
+ AsciiQueryResultSet(const TResultSetMetadata& metadata, vector<string>*
rowset,
+ bool stringify_map_keys)
+ : metadata_(metadata), result_set_(rowset),
stringify_map_keys_(stringify_map_keys) {}
virtual ~AsciiQueryResultSet() {}
@@ -83,13 +84,17 @@ class AsciiQueryResultSet : public QueryResultSet {
/// Points to the result set to be filled. Not owned by this object.
vector<string>* result_set_;
+
+ // If true, converts map keys to strings; see IMPALA-11778.
+ const bool stringify_map_keys_;
};
/// Result set container for Hive protocol versions >= V6, where results are
returned in
/// column-orientation.
class HS2ColumnarResultSet : public QueryResultSet {
public:
- HS2ColumnarResultSet(const TResultSetMetadata& metadata, TRowSet* rowset);
+ HS2ColumnarResultSet(const TResultSetMetadata& metadata, TRowSet* rowset,
+ bool stringify_map_keys);
virtual ~HS2ColumnarResultSet() {}
@@ -122,6 +127,9 @@ class HS2ColumnarResultSet : public QueryResultSet {
int64_t num_rows_;
+ // If true, converts map keys to strings; see IMPALA-11778.
+ const bool stringify_map_keys_;
+
void InitColumns();
};
@@ -160,16 +168,17 @@ class HS2RowOrientedResultSet : public QueryResultSet {
};
QueryResultSet* QueryResultSet::CreateAsciiQueryResultSet(
- const TResultSetMetadata& metadata, vector<string>* rowset) {
- return new AsciiQueryResultSet(metadata, rowset);
+ const TResultSetMetadata& metadata, vector<string>* rowset, bool
stringify_map_keys) {
+ return new AsciiQueryResultSet(metadata, rowset, stringify_map_keys);
}
QueryResultSet* QueryResultSet::CreateHS2ResultSet(
- TProtocolVersion::type version, const TResultSetMetadata& metadata,
TRowSet* rowset) {
+ TProtocolVersion::type version, const TResultSetMetadata& metadata,
TRowSet* rowset,
+ bool stringify_map_keys) {
if (version < TProtocolVersion::HIVE_CLI_SERVICE_PROTOCOL_V6) {
return new HS2RowOrientedResultSet(metadata, rowset);
} else {
- return new HS2ColumnarResultSet(metadata, rowset);
+ return new HS2ColumnarResultSet(metadata, rowset, stringify_map_keys);
}
}
@@ -202,7 +211,8 @@ Status AsciiQueryResultSet::AddRows(const
vector<ScalarExprEvaluator*>& expr_eva
&out_stream);
} else if (metadata_.columns[i].columnType.types.size() > 1) {
ColumnType col_type =
ColumnType::FromThrift(metadata_.columns[i].columnType);
- PrintComplexValue(expr_evals[i], it.Get(), &out_stream, col_type);
+ PrintComplexValue(expr_evals[i], it.Get(), &out_stream, col_type,
+ stringify_map_keys_);
} else {
DCHECK(false);
}
@@ -214,7 +224,8 @@ Status AsciiQueryResultSet::AddRows(const
vector<ScalarExprEvaluator*>& expr_eva
}
void QueryResultSet::PrintComplexValue(ScalarExprEvaluator* expr_eval,
- const TupleRow* row, stringstream *stream, const ColumnType& type) {
+ const TupleRow* row, stringstream *stream, const ColumnType& type,
+ bool stringify_map_keys) {
DCHECK(type.IsComplexType());
const ScalarExpr& scalar_expr = expr_eval->root();
// Currently scalar_expr can be only a slot ref as no functions return
complex types.
@@ -235,12 +246,15 @@ void
QueryResultSet::PrintComplexValue(ScalarExprEvaluator* expr_eval,
DCHECK(item_tuple_desc != nullptr);
ComplexValueWriter<rapidjson::BasicOStreamWrapper<stringstream>>
- ::CollectionValueToJSON(*collection_val, type.type, item_tuple_desc,
&writer);
+ complex_value_writer(&writer, stringify_map_keys);
+ complex_value_writer.CollectionValueToJSON(*collection_val, type.type,
+ item_tuple_desc);
} else {
DCHECK(type.IsStructType());
const StructVal* struct_val = static_cast<const StructVal*>(value);
ComplexValueWriter<rapidjson::BasicOStreamWrapper<stringstream>>
- ::StructValToJSON(*struct_val, type, &writer);
+ complex_value_writer(&writer, stringify_map_keys);
+ complex_value_writer.StructValToJSON(*struct_val, type);
}
}
@@ -323,8 +337,9 @@ uint32_t TColumnByteSize(const ThriftTColumn& col, uint32_t
start_idx, uint32_t
// Result set container for Hive protocol versions >= V6, where results are
returned in
// column-orientation.
HS2ColumnarResultSet::HS2ColumnarResultSet(
- const TResultSetMetadata& metadata, TRowSet* rowset)
- : metadata_(metadata), result_set_(rowset), num_rows_(0) {
+ const TResultSetMetadata& metadata, TRowSet* rowset, bool
stringify_map_keys)
+ : metadata_(metadata), result_set_(rowset), num_rows_(0),
+ stringify_map_keys_(stringify_map_keys) {
if (rowset == NULL) {
owned_result_set_.reset(new TRowSet());
result_set_ = owned_result_set_.get();
@@ -341,7 +356,7 @@ Status HS2ColumnarResultSet::AddRows(const
vector<ScalarExprEvaluator*>& expr_ev
const TColumnType& type = metadata_.columns[i].columnType;
ScalarExprEvaluator* expr_eval = expr_evals[i];
ExprValuesToHS2TColumn(expr_eval, type, batch, start_idx, num_rows,
num_rows_,
- &(result_set_->columns[i]));
+ stringify_map_keys_, &(result_set_->columns[i]));
}
num_rows_ += num_rows;
return Status::OK();
diff --git a/be/src/service/query-result-set.h
b/be/src/service/query-result-set.h
index 50f2f4430..60621ffa0 100644
--- a/be/src/service/query-result-set.h
+++ b/be/src/service/query-result-set.h
@@ -67,23 +67,27 @@ class QueryResultSet {
/// Returns the size of this result set in number of rows.
virtual size_t size() = 0;
- /// Returns a result set suitable for Beeswax-based clients.
+ /// Returns a result set suitable for Beeswax-based clients. If
'stringify_map_keys' is
+ /// true, converts map keys to strings; see IMPALA-11778.
static QueryResultSet* CreateAsciiQueryResultSet(
- const TResultSetMetadata& metadata, std::vector<std::string>* rowset);
+ const TResultSetMetadata& metadata, std::vector<std::string>* rowset,
+ bool stringify_map_keys);
/// Returns a result set suitable for HS2-based clients. If 'rowset' is
nullptr, the
- /// returned object will allocate and manage its own rowset.
+ /// returned object will allocate and manage its own rowset. If
'stringift_map_keys' is
+ /// true, converts map keys to strings; see IMPALA-11778.
static QueryResultSet* CreateHS2ResultSet(
apache::hive::service::cli::thrift::TProtocolVersion::type version,
const TResultSetMetadata& metadata,
- apache::hive::service::cli::thrift::TRowSet* rowset);
+ apache::hive::service::cli::thrift::TRowSet* rowset, bool
stringify_map_keys);
protected:
/// Wrapper to call ComplexValueWriter::CollectionValueToJSON() or
/// ComplexValueWriter::StructValToJSON() for a given complex column.
expr_eval must be
- /// a SlotRef on a complex-typed (collection or struct) slot.
+ /// a SlotRef on a complex-typed (collection or struct) slot. If
'stringify_map_keys' is
+ /// true, converts map keys to strings; see IMPALA-11778.
static void PrintComplexValue(ScalarExprEvaluator* expr_eval, const
TupleRow* row,
- std::stringstream *stream, const ColumnType& type);
+ std::stringstream *stream, const ColumnType& type, bool
stringify_map_keys);
};
}
diff --git a/common/thrift/ImpalaService.thrift
b/common/thrift/ImpalaService.thrift
index e759c068e..49a7595be 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -754,6 +754,9 @@ enum TImpalaQueryOptions {
// an issue.
// Only valid if DISABLE_CODEGEN_CACHE is set to false.
CODEGEN_CACHE_MODE = 150;
+
+ // Convert non-string map keys to string to produce valid JSON.
+ STRINGIFY_MAP_KEYS = 151
}
// The summary of a DML statement.
diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift
index 4ab9d4101..0bf150e14 100644
--- a/common/thrift/Query.thrift
+++ b/common/thrift/Query.thrift
@@ -612,6 +612,9 @@ struct TQueryOptions {
// See comment in ImpalaService.thrift
150: optional bool disable_codegen_cache = false;
151: optional TCodeGenCacheMode codegen_cache_mode =
TCodeGenCacheMode.NORMAL;
+
+ // See comment in ImpalaService.thrift
+ 152: optional bool stringify_map_keys = false;
}
// Impala currently has three types of sessions: Beeswax, HiveServer2 and
external
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/map_null_keys.test
b/testdata/workloads/functional-query/queries/QueryTest/map_null_keys.test
index e835742a6..9d8cd8a89 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/map_null_keys.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/map_null_keys.test
@@ -22,3 +22,27 @@ from map_null_keys;
---- TYPES
INT,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING
=====
+---- QUERY
+-- Test that NULL map keys are printed correctly with STRINGIFY_MAP_KEYS=true.
+set CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS=1;
+set STRINGIFY_MAP_KEYS=1;
+select
+ id,
+ map_bool_key,
+ map_tinyint_key,
+ map_smallint_key,
+ map_bigint_key,
+ map_float_key,
+ map_double_key,
+ map_decimal_key,
+ map_string_key,
+ map_char_key,
+ map_varchar_key,
+ map_timestamp_key,
+ map_date_key
+from map_null_keys;
+---- RESULTS
+1,'{"true":"true","null":"null"}','{"-1":"one","null":"null"}','{"-1":"one","null":"null"}','{"-1":"one","null":"null"}','{"-1.75":"a","null":"null"}','{"-1.75":"a","null":"null"}','{"-1.8":"a","null":"null"}','{"one":1,"null":null}','{"Mon":1,"null":null}','{"a":"A","null":null}','{"2022-12-10
08:15:12":"Saturday
morning","null":"null"}','{"2022-12-10":"Saturday","null":"null"}'
+---- TYPES
+INT,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING
+=====
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/nested-map-in-select-list.test
b/testdata/workloads/functional-query/queries/QueryTest/nested-map-in-select-list.test
index f611605a7..d834633c9 100644
---
a/testdata/workloads/functional-query/queries/QueryTest/nested-map-in-select-list.test
+++
b/testdata/workloads/functional-query/queries/QueryTest/nested-map-in-select-list.test
@@ -346,3 +346,26 @@ from collection_tbl;
---- TYPES
STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING
=====
+---- QUERY
+-- Test that map keys are printed correctly with STRINGIFY_MAP_KEYS=true.
+set CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS=1;
+set STRINGIFY_MAP_KEYS=1;
+select
+ map_bool_key,
+ map_tinyint_key,
+ map_smallint_key,
+ map_bigint_key,
+ map_float_key,
+ map_double_key,
+ map_decimal_key,
+ map_string_key,
+ map_char_key,
+ map_varchar_key,
+ map_timestamp_key,
+ map_date_key
+from collection_tbl;
+---- RESULTS
+'{"true":"true","false":"false"}','{"-1":"a","0":"b","1":"c"}','{"-1":"a","0":"b","1":"c"}','{"-1":"a","0":"b","1":"c"}','{"-1.5":"a","0.25":"b","1.75":"c"}','{"-1.5":"a","0.25":"b","1.75":"c"}','{"-1.8":"a","0.2":"b","1.2":"c"}','{"one":1,"two":2,"three":3}','{"Mon":1,"Tue":2,"Wed":3,"Thu":4,"Fri":5,"Sat":6,"Sun":7}','{"a":"A","ab":"AB","abc":"ABC"}','{"2022-12-10
08:15:12":"Saturday morning","2022-12-09 18:15:12":"Friday
evening"}','{"2022-12-10":"Saturday","2022-12-09":"Friday"}'
+---- TYPES
+STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING
+=====