This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 18d40679a441068072856995a3fb6d4cabfe6d68 Author: Alexey Serbin <[email protected]> AuthorDate: Fri Jul 29 14:07:37 2022 -0700 KUDU-2671 range-specific hash schemas in 'kudu table create' CLI This patch adds support for range-specific hash schemas into the 'kudu table create' CLI tool. This patch also contains a test scenario to cover the newly introduced functionality. Change-Id: I94aab482792ef93754b6475e1390b8f0c4a05678 Reviewed-on: http://gerrit.cloudera.org:8080/18809 Reviewed-by: Mahesh Reddy <[email protected]> Tested-by: Alexey Serbin <[email protected]> Reviewed-by: Yingchun Lai <[email protected]> Reviewed-by: Abhishek Chennaka <[email protected]> Reviewed-by: Attila Bukor <[email protected]> --- src/kudu/tools/create-table-tool-test.cc | 130 +++++++++++++++++++++++++++---- src/kudu/tools/tool.proto | 40 ++++++---- src/kudu/tools/tool_action_table.cc | 64 +++++++++++++-- 3 files changed, 198 insertions(+), 36 deletions(-) diff --git a/src/kudu/tools/create-table-tool-test.cc b/src/kudu/tools/create-table-tool-test.cc index d592638d7..9ce59a98d 100644 --- a/src/kudu/tools/create-table-tool-test.cc +++ b/src/kudu/tools/create-table-tool-test.cc @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include <algorithm> +#include <cstdint> #include <cstdio> #include <functional> #include <map> #include <memory> #include <string> #include <unordered_map> -#include <utility> #include <vector> #include <gtest/gtest.h> @@ -99,28 +100,32 @@ TEST_F(CreateTableToolTest, TestCreateTable) { // Test a few good cases. 
const auto check_good_input = [&](const string& json_str, - const string& master, + const string& master_rpc_addr, const string& table_name, - const string& schema, - const string& partition, + const string& schema_str, + const string& partition_str, const map<string, string>& extra_configs, - KuduClient* client) { + KuduClient* client, + shared_ptr<KuduTable>* table_out = nullptr) { const vector<string> table_args = { - "table", "create", master, json_str + "table", "create", master_rpc_addr, json_str }; bool table_exists = false; ASSERT_OK(RunKuduTool(table_args)); ASSERT_EVENTUALLY([&] { - ASSERT_OK(client->TableExists(table_name, &table_exists)); - ASSERT_TRUE(table_exists); + ASSERT_OK(client->TableExists(table_name, &table_exists)); + ASSERT_TRUE(table_exists); }); shared_ptr<KuduTable> table; ASSERT_OK(client->OpenTable(table_name, &table)); - ASSERT_EQ(table->name(), table_name); - ASSERT_EQ(table->schema().ToString(), schema); - ASSERT_EQ(table->partition_schema().DebugString(KuduSchema::ToSchema( - table->schema())), partition); - ASSERT_EQ(table->extra_configs(), extra_configs); + ASSERT_EQ(table_name, table->name()); + ASSERT_EQ(schema_str, table->schema().ToString()); + ASSERT_EQ(partition_str, table->partition_schema().DebugString( + KuduSchema::ToSchema(table->schema()))); + ASSERT_EQ(extra_configs, table->extra_configs()); + if (table_out) { + *table_out = std::move(table); + } }; // Create a simple table. @@ -613,6 +618,105 @@ TEST_F(CreateTableToolTest, TestCreateTable) { NO_FATALS(check_good_input(encoding_type_unknown, master_addr, "encoding_type_unknown", schema, partition, {}, client.get())); + // Create a table with a range having custom hash schema. 
+ const string range_with_custom_hash_schema = R"( + { + "table_name": "range_with_custom_hash_schema", + "schema": { + "columns": [ + { + "column_name": "id", + "column_type": "INT32", + "is_nullable": false, + }, + { + "column_name": "name", + "column_type": "STRING", + "is_nullable": true, + } + ], + "key_column_names": [ + "id" + ] + }, + "partition": { + "hash_partitions": [ + { + "columns": ["id"], + "num_buckets": 2, + "seed": 1 + } + ], + "range_partition": { + "columns": ["id"], + "range_bounds": [ + { + "upper_bound": { + "bound_values": ["-100"], + "bound_type": "EXCLUSIVE" + } + }, + { + "lower_bound": { + "bound_values": ["100"], + "bound_type": "INCLUSIVE" + } + } + ], + "custom_hash_schema_ranges": [ + { + "range_bounds": { + "lower_bound": { + "bound_values": ["-100"], + "bound_type": "INCLUSIVE" + }, + "upper_bound": { + "bound_values": ["100"], + "bound_type": "EXCLUSIVE" + } + }, + "hash_schema": { + "columns": ["id"], + "num_buckets": 5, + "seed": 8 + } + } + ] + } + } + } + )"; + { + constexpr const char* const kRefSchema = + "(\n" + " id INT32 NOT NULL,\n" + " name STRING NULLABLE,\n" + " PRIMARY KEY (id)\n)"; + constexpr const char* const kRefPartitionInfo = + "HASH (id) PARTITIONS 2 SEED 1, RANGE (id)"; + shared_ptr<KuduTable> table; + NO_FATALS(check_good_input(range_with_custom_hash_schema, + master_addr, + "range_with_custom_hash_schema", + kRefSchema, + kRefPartitionInfo, + {}, + client.get(), + &table)); + vector<Partition> partitions; + ASSERT_OK(table->ListPartitions(&partitions)); + ASSERT_EQ(9, partitions.size()); + vector<int32_t> bucket_nums; + for (const auto& p : partitions) { + // All hash schemas in this table are one-dimensional. + ASSERT_EQ(1, p.hash_buckets().size()); + bucket_nums.emplace_back(p.hash_buckets().front()); + } + std::sort(bucket_nums.begin(), bucket_nums.end()); + const vector<int32_t> ref_bucket_nums{0, 0, 0, 1, 1, 1, 2, 3, 4}; + ASSERT_EQ(ref_bucket_nums, bucket_nums); + } + // Test a few error cases. 
const auto check_bad_input = [&](const string& json_str, const string& master, diff --git a/src/kudu/tools/tool.proto b/src/kudu/tools/tool.proto index dc735e137..f98d8cd84 100644 --- a/src/kudu/tools/tool.proto +++ b/src/kudu/tools/tool.proto @@ -398,6 +398,20 @@ message ColumnPB { } message PartitionPB { + message HashPartitionPB { + // Column names of columns included in the hash. Every column must be + // a component of the primary key. + repeated string columns = 1; + // Number of buckets into which columns will be hashed. Must be at least 2. + optional int32 num_buckets = 2; + // Seed value for hash calculation. Administrators may set a seed value + // on a per-table basis in order to randomize the mapping of rows to + // buckets. Setting a seed provides some amount of protection against denial + // of service attacks when the hash bucket columns contain user provided + // input. + optional uint32 seed = 3; + } + message RangePartitionPB { message BoundPB { enum Type { @@ -422,6 +436,12 @@ message PartitionPB { // exact string value for the bound. repeated string split_values = 1; } + message RangeWithHashSchemaPB { + // The bounds of this range. + optional RangeBoundPB range_bounds = 1; + // Hash schema for this range. + repeated HashPartitionPB hash_schema = 2; + } // Column names of columns included in the range. All columns must be // a component of the primary key. @@ -430,25 +450,13 @@ message PartitionPB { repeated RangeBoundPB range_bounds = 2; // Range splits. repeated SplitValuePB range_splits = 3; + // Ranges with custom hash schemas. + repeated RangeWithHashSchemaPB custom_hash_schema_ranges = 4; } - message HashPartitionPB { - // Column names of columns included in the hash. Every column must be - // a component of the primary key. - repeated string columns = 1; - // Number of buckets into which columns will be hashed. Must be at least 2. - optional int32 num_buckets = 2; - // Seed value for hash calculation. 
Administrators may set a seed value - // on a per-table basis in order to randomize the mapping of rows to - // buckets. Setting a seed provides some amount of protection against denial - // of service attacks when the hash bucket columns contain user provided - // input. - optional uint32 seed = 3; - } - - // Hash partition message. Support zero or more hash partition levels . + // Table-wide hash schema. repeated HashPartitionPB hash_partitions = 1; - // range partition message. + // Range partitioning information. optional RangePartitionPB range_partition = 2; } diff --git a/src/kudu/tools/tool_action_table.cc b/src/kudu/tools/tool_action_table.cc index a03c23574..b764781de 100644 --- a/src/kudu/tools/tool_action_table.cc +++ b/src/kudu/tools/tool_action_table.cc @@ -72,6 +72,7 @@ using kudu::client::KuduColumnSchema; using kudu::client::KuduColumnSpec; using kudu::client::KuduColumnStorageAttributes; using kudu::client::KuduPredicate; +using kudu::client::KuduRangePartition; using kudu::client::KuduScanToken; using kudu::client::KuduScanTokenBuilder; using kudu::client::KuduScanner; @@ -1491,11 +1492,8 @@ Status ParseTablePartition(const PartitionPB& partition, string bound_partial_row_json; for (const auto& bound : partition.range_partition().range_bounds()) { unique_ptr<KuduPartialRow> lower_bound(kudu_schema.NewRow()); - unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow()); KuduTableCreator::RangePartitionBound lower_bound_type = KuduTableCreator::INCLUSIVE_BOUND; - KuduTableCreator::RangePartitionBound upper_bound_type = - KuduTableCreator::EXCLUSIVE_BOUND; if (bound.has_lower_bound()) { RETURN_NOT_OK(ToJsonPartialRow(bound.lower_bound().bound_values(), range_col_names_and_types, @@ -1503,9 +1501,13 @@ Status ParseTablePartition(const PartitionPB& partition, RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types, bound_partial_row_json, lower_bound.get())); - RETURN_NOT_OK(ToClientRangePartitionBound(bound.lower_bound().bound_type(), - 
&lower_bound_type)); + RETURN_NOT_OK(ToClientRangePartitionBound( + bound.lower_bound().bound_type(), &lower_bound_type)); } + + unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow()); + KuduTableCreator::RangePartitionBound upper_bound_type = + KuduTableCreator::EXCLUSIVE_BOUND; if (bound.has_upper_bound()) { RETURN_NOT_OK(ToJsonPartialRow(bound.upper_bound().bound_values(), range_col_names_and_types, @@ -1513,12 +1515,60 @@ Status ParseTablePartition(const PartitionPB& partition, RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types, bound_partial_row_json, upper_bound.get())); - RETURN_NOT_OK(ToClientRangePartitionBound(bound.upper_bound().bound_type(), - &upper_bound_type)); + RETURN_NOT_OK(ToClientRangePartitionBound( + bound.upper_bound().bound_type(), &upper_bound_type)); } + table_creator->add_range_partition(lower_bound.release(), upper_bound.release(), lower_bound_type, upper_bound_type); } + + for (const auto& range : partition.range_partition().custom_hash_schema_ranges()) { + const auto& bounds = range.range_bounds(); + + unique_ptr<KuduPartialRow> lower_bound(kudu_schema.NewRow()); + KuduTableCreator::RangePartitionBound lower_bound_type = + KuduTableCreator::INCLUSIVE_BOUND; + if (bounds.has_lower_bound()) { + RETURN_NOT_OK(ToJsonPartialRow(bounds.lower_bound().bound_values(), + range_col_names_and_types, + &bound_partial_row_json)); + RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types, + bound_partial_row_json, + lower_bound.get())); + RETURN_NOT_OK(ToClientRangePartitionBound( + bounds.lower_bound().bound_type(), &lower_bound_type)); + } + + unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow()); + KuduTableCreator::RangePartitionBound upper_bound_type = + KuduTableCreator::EXCLUSIVE_BOUND; + if (bounds.has_upper_bound()) { + RETURN_NOT_OK(ToJsonPartialRow(bounds.upper_bound().bound_values(), + range_col_names_and_types, + &bound_partial_row_json)); + 
RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types, + bound_partial_row_json, + upper_bound.get())); + RETURN_NOT_OK(ToClientRangePartitionBound( + bounds.upper_bound().bound_type(), &upper_bound_type)); + } + + unique_ptr<KuduRangePartition> partition( + new KuduRangePartition(lower_bound.release(), upper_bound.release(), + lower_bound_type, upper_bound_type)); + for (const auto& hash_dimension : range.hash_schema()) { + vector<string> hash_columns; + for (const auto& c : hash_dimension.columns()) { + hash_columns.emplace_back(c); + } + const int32_t seed = hash_dimension.has_seed() ? hash_dimension.seed() : 0; + partition->add_hash_partitions( + hash_columns, hash_dimension.num_buckets(), seed); + } + + table_creator->add_custom_range_partition(partition.release()); + } for (const auto& split_pb : partition.range_partition().range_splits()) { RETURN_NOT_OK(ToJsonPartialRow(split_pb.split_values(), range_col_names_and_types,
