[kudu] branch master updated: [common] more generic API for IN list predicate pruning

alexey Tue, 10 Aug 2021 19:15:08 -0700

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git



The following commit(s) were added to refs/heads/master by this push:
     new 607d9d0  [common] more generic API for IN list predicate pruning
607d9d0 is described below

commit 607d9d0a7e95e220864f43b88a64644bb6402163
Author: Alexey Serbin <[email protected]>
AuthorDate: Mon Aug 9 21:18:29 2021 -0700

    [common] more generic API for IN list predicate pruning
    
    While working on KUDU-2671, I found that the exposed internals of the
    PartitionSchema class don't allow for updating the implementation of
    the partition-related code to include per-range custom hash bucket
    schemas in a consistent manner.
    
    This patch introduces a bit more generic interface for pruning values of
    IN list predicates by adding a new PartitionMayContainRow() method and
    removes the following methods from the public API of the PartitionSchema
    class:
      * HashPartitionContainsRow()
      * RangePartitionContainsRow()
      * IsColumnSingleRangeSchema()
      * TryGetSingleColumnHashPartitionIndex()
    
    I also added one extra test scenario and updated existing ones to
    increase readability of the assertion messages if they are triggered.
    
    This is a follow-up to 6a7cadc7e and 83b8caf4f.
    
    Change-Id: I2e2390cc4747864fdac71656dd7125ac3b15bf9d
    Reviewed-on: http://gerrit.cloudera.org:8080/17764
    Tested-by: Kudu Jenkins
    Reviewed-by: Mahesh Reddy <[email protected]>
    Reviewed-by: Andrew Wong <[email protected]>
---
 src/kudu/common/partition.cc      |  86 ++--
 src/kudu/common/partition.h       |  59 +--
 src/kudu/common/scan_spec-test.cc | 833 +++++++++++++++-----------------------
 src/kudu/common/scan_spec.cc      |  59 ++-
 src/kudu/common/scan_spec.h       |  27 +-
 5 files changed, 462 insertions(+), 602 deletions(-)

diff --git a/src/kudu/common/partition.cc b/src/kudu/common/partition.cc
index 8a5a846..9cb06af 100644
--- a/src/kudu/common/partition.cc
+++ b/src/kudu/common/partition.cc
@@ -709,14 +709,54 @@ bool PartitionSchema::PartitionContainsRow(const 
Partition& partition,
   return PartitionContainsRowImpl(partition, row);
 }
 
-bool PartitionSchema::HashPartitionContainsRow(const Partition& partition,
-                                               const KuduPartialRow& row,
-                                               int hash_idx) const {
-  return HashPartitionContainsRowImpl(partition, row, hash_idx);
+bool PartitionSchema::PartitionMayContainRow(const Partition& partition,
+                                             const KuduPartialRow& row) const {
+  // It's the fast and 100% sure path when the row has the primary key set.
+  if (row.IsKeySet()) {
+    return PartitionContainsRow(partition, row);
+  }
+
+  const Schema* schema = row.schema();
+  vector<ColumnId> set_column_ids;
+  set_column_ids.reserve(schema->num_key_columns());
+  for (size_t idx = 0; idx < schema->num_key_columns(); ++idx) {
+    DCHECK(schema->is_key_column(idx));
+    if (row.IsColumnSet(idx) && !row.IsNull(idx)) {
+      set_column_ids.emplace_back(schema->column_id(idx));
+      if (set_column_ids.size() > 1) {
+        // No further optimizations in this case for a while.
+        // NOTE: This might be a false positive.
+        return true;
+      }
+    }
+  }
+  if (set_column_ids.empty()) {
+    return false;
+  }
+
+  DCHECK_EQ(1, set_column_ids.size());
+  const auto& single_column_id = set_column_ids[0];
+
+  if (range_schema_.column_ids.size() == 1 &&
+      range_schema_.column_ids[0] == single_column_id &&
+      !RangePartitionContainsRow(partition, row)) {
+    return false;
+  }
+  for (size_t i = 0; i < hash_bucket_schemas_.size(); ++i) {
+    const auto& hash_partition = hash_bucket_schemas_[i];
+    if (hash_partition.column_ids.size() == 1 &&
+        hash_partition.column_ids[0] == single_column_id &&
+        !HashPartitionContainsRow(partition, row, i)) {
+      return false;
+    }
+  }
+
+  // NOTE: This might be a false positive.
+  return true;
 }
 
 bool PartitionSchema::HashPartitionContainsRow(const Partition& partition,
-                                               const ConstContiguousRow& row,
+                                               const KuduPartialRow& row,
                                                int hash_idx) const {
   return HashPartitionContainsRowImpl(partition, row, hash_idx);
 }
@@ -726,11 +766,6 @@ bool PartitionSchema::RangePartitionContainsRow(
   return RangePartitionContainsRowImpl(partition, row);
 }
 
-bool PartitionSchema::RangePartitionContainsRow(
-    const Partition& partition, const ConstContiguousRow& row) const {
-  return RangePartitionContainsRowImpl(partition, row);
-}
-
 Status PartitionSchema::DecodeRangeKey(Slice* encoded_key,
                                        KuduPartialRow* partial_row,
                                        Arena* arena) const {
@@ -1559,36 +1594,19 @@ Status 
PartitionSchema::MakeUpperBoundRangePartitionKeyExclusive(KuduPartialRow*
   return Status::OK();
 }
 
-Status PartitionSchema::GetRangeSchemaColumnIndexes(const Schema& schema,
-                                                    vector<int32_t>* 
range_column_idxs) const {
+Status PartitionSchema::GetRangeSchemaColumnIndexes(
+    const Schema& schema,
+    vector<int32_t>* range_column_indexes) const {
+  DCHECK(range_column_indexes);
   for (const ColumnId& column_id : range_schema_.column_ids) {
     int32_t idx = schema.find_column_by_id(column_id);
     if (idx == Schema::kColumnNotFound) {
-      return Status::InvalidArgument(Substitute("range partition column ID $0 "
-                                                "not found in range partition 
key schema.",
-                                                column_id));
+      return Status::InvalidArgument(Substitute(
+          "range partition column ID $0 not found in table schema", 
column_id));
     }
-    range_column_idxs->push_back(idx);
+    range_column_indexes->push_back(idx);
   }
   return Status::OK();
 }
 
-int32_t PartitionSchema::TryGetSingleColumnHashPartitionIndex(const Schema& 
schema,
-                                                              int32_t col_idx) 
const {
-  const ColumnId column_id = schema.column_id(col_idx);
-  for (int i = 0; i < hash_bucket_schemas_.size(); ++i) {
-    const auto& hash_partition = hash_bucket_schemas_[i];
-    if (hash_partition.column_ids.size() == 1 && hash_partition.column_ids[0] 
== column_id) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-bool PartitionSchema::IsColumnSingleRangeSchema(const Schema& schema, int32_t 
col_idx) const {
-  const ColumnId column_id = schema.column_id(col_idx);
-  return range_schema_.column_ids.size() == 1 &&
-         range_schema_.column_ids[0] == column_id;
-}
-
 } // namespace kudu
diff --git a/src/kudu/common/partition.h b/src/kudu/common/partition.h
index bb3e773..fd72b69 100644
--- a/src/kudu/common/partition.h
+++ b/src/kudu/common/partition.h
@@ -225,25 +225,32 @@ class PartitionSchema {
       const Schema& schema,
       std::vector<Partition>* partitions) const WARN_UNUSED_RESULT;
 
-  // Tests if the partition contains the row.
+  // Check if the given partition contains the specified row. The row must have
+  // all the columns participating in the table's partitioning schema
+  // set to particular values.
   bool PartitionContainsRow(const Partition& partition,
                             const KuduPartialRow& row) const;
   bool PartitionContainsRow(const Partition& partition,
                             const ConstContiguousRow& row) const;
 
-  // Tests if the hash partition contains the row with given hash_idx.
-  bool HashPartitionContainsRow(const Partition& partition,
-                                const KuduPartialRow& row,
-                                int hash_idx) const;
-  bool HashPartitionContainsRow(const Partition& partition,
-                                const ConstContiguousRow& row,
-                                int hash_idx) const;
-
-  // Tests if the range partition contains the row.
-  bool RangePartitionContainsRow(const Partition& partition,
-                                 const KuduPartialRow& row) const;
-  bool RangePartitionContainsRow(const Partition& partition,
-                                 const ConstContiguousRow& row) const;
+  // Check if the specified row is probably in the given partition.
+  // The collection of columns set to particular values in the row can be a
+  // subset of all the columns participating in the table's partitioning 
schema.
+  // This method can be used to optimize the collection of values for IN list
+  // predicates. As of now, this method is effectively implemented only for
+  // single-column hash and single-column range partitioning schemas, meaning
+  // that it can return false positives for anything other than single-column
+  // range and hash partitioning schemas.
+  //
+  // NOTE: this method returns false positives in some cases (see above)
+  //
+  // TODO(aserbin): implement this for multi-column range schemas as well,
+  //                substituting non-specified columns in the row with values
+  //                from the partition's start key and return logically 
inverted
+  //                result of calling PartitionContainsRow() with the
+  //                artificially constructed row
+  bool PartitionMayContainRow(const Partition& partition,
+                              const KuduPartialRow& row) const;
 
   // Returns a text description of the partition suitable for debug printing.
   //
@@ -327,18 +334,13 @@ class PartitionSchema {
     return ranges_with_hash_schemas_;
   }
 
-  // Gets the vector containing the column indexes of the range partition keys.
+  // Given the specified table schema, populate the 'range_column_indexes'
+  // container with column indexes of the range partition keys.
   // If any of the columns is not in the key range columns then an
   // InvalidArgument status is returned.
-  Status GetRangeSchemaColumnIndexes(const Schema& schema,
-                                     std::vector<int>* range_column_idxs) 
const;
-
-  // Returns index of given column idx, if it is one of hash key and this hash 
schema
-  // contains only one column, otherwise returns -1.
-  int32_t TryGetSingleColumnHashPartitionIndex(const Schema& schema, int32_t 
col_idx) const;
-
-  // Given a column idx, verify that it is the only column of the range 
partition.
-  bool IsColumnSingleRangeSchema(const Schema& schema, int32_t col_idx) const;
+  Status GetRangeSchemaColumnIndexes(
+      const Schema& schema,
+      std::vector<int>* range_column_indexes) const;
 
  private:
   friend class PartitionPruner;
@@ -346,6 +348,15 @@ class PartitionSchema {
   FRIEND_TEST(PartitionTest, TestIncrementRangePartitionStringBounds);
   FRIEND_TEST(PartitionTest, TestVarcharRangePartitions);
 
+  // Tests if the hash partition contains the row with given hash_idx.
+  bool HashPartitionContainsRow(const Partition& partition,
+                                const KuduPartialRow& row,
+                                int hash_idx) const;
+
+  // Tests if the range partition contains the row.
+  bool RangePartitionContainsRow(const Partition& partition,
+                                 const KuduPartialRow& row) const;
+
   // Returns a text description of the encoded range key suitable for debug 
printing.
   std::string RangeKeyDebugString(Slice range_key, const Schema& schema) const;
   std::string RangeKeyDebugString(const KuduPartialRow& key) const;
diff --git a/src/kudu/common/scan_spec-test.cc 
b/src/kudu/common/scan_spec-test.cc
index 7ebb096..f43151e 100644
--- a/src/kudu/common/scan_spec-test.cc
+++ b/src/kudu/common/scan_spec-test.cc
@@ -86,7 +86,6 @@ string PruneInlistValuesAndGetSchemaString(const ScanSpec& 
spec,
                                            const PartitionSchema& 
partition_schema,
                                            Arena* arena) {
   ScanSpec copy_spec = spec;
-
   copy_spec.PruneInlistValuesIfPossible(schema, partition, partition_schema);
   copy_spec.OptimizeScan(schema, arena, true);
 
@@ -423,66 +422,50 @@ TEST_F(CompositeIntKeysTest, TestInListPushdown) {
 }
 
 // Test that hash(a) IN list predicates prune with right values.
-TEST_F(CompositeIntKeysTest, TestOneHashKeyInListHashPruning) {
+TEST_F(CompositeIntKeysTest, OneHashKeyInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 50, 100 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3) },
-                          {},
-                          &partition_schema);
+  GeneratePartitionSchema(schema, { { { "a" }, 3 } }, {}, &partition_schema);
 
   vector<Partition> partitions;
   ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, 
&partitions));
   ASSERT_EQ(3, partitions.size());
 
   // Verify the splitted values can merge into original set without 
overlapping.
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=101, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=50, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=101, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (50, 100)");
+            "a IN (1, 3, 6) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
 }
 
 // Test that hash(a), range(a) IN list predicates prune would happen on
 // both hash and range aspects.
-TEST_F(CompositeIntKeysTest, TestOneHashKeyOneRangeKeyInListHashPruning) {
+TEST_F(CompositeIntKeysTest, OneHashKeyOneRangeKeyInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 50, 100 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3) },
-                          { "a" },
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 } }, { "a" }, &partition_schema);
 
   KuduPartialRow split1(&schema);
   KuduPartialRow split2(&schema);
@@ -490,107 +473,74 @@ TEST_F(CompositeIntKeysTest, 
TestOneHashKeyOneRangeKeyInListHashPruning) {
   ASSERT_OK(split2.SetInt8("a", 6));
 
   vector<Partition> partitions;
-  ASSERT_OK(partition_schema.CreatePartitions({ split1, split2 },
-                                              {},
-                                              {},
-                                              schema,
-                                              &partitions));
+  ASSERT_OK(partition_schema.CreatePartitions(
+      { split1, split2 }, {}, {}, schema, &partitions));
   ASSERT_EQ(9, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "a IN () AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
+  ASSERT_EQ("a IN () AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=4, int8 b=101, int8 c=-128) AND "
-            "a IN (4) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=7, int8 b=50, int8 c=-128) AND "
+            "a IN (4) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=7, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=101, int8 c=-128) AND "
-            "a IN (7, 8) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[3],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+            "a IN (7, 8) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=2, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 2) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[4],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=5, int8 b=50, int8 c=-128) AND "
+            "a IN (0, 2) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[3], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=5, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=5, int8 b=101, int8 c=-128) AND "
-            "a IN (5) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[5],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=9, int8 b=50, int8 c=-128) AND "
+            "a IN (5) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[4], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=9, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (9) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[6],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=50, int8 c=-128) AND "
+            "a IN (9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[5], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=1, int8 b=101, int8 c=-128) AND "
-            "a IN (1) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[7],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=3, int8 b=50, int8 c=-128) AND "
+            "a IN (1) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[6], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=3, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=3, int8 b=101, int8 c=-128) AND "
-            "a IN (3) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[8],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=6, int8 b=50, int8 c=-128) AND "
+            "a IN (3) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[7], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=6, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=101, int8 c=-128) AND "
-            "a IN (6) AND b IN (50, 100)");
+            "a IN (6) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[8], partition_schema, &arena_));
 }
 
 // Test that hash(a), range(a, b) IN list predicates prune would happen
 // on hash-key but not on range key.
-TEST_F(CompositeIntKeysTest, TestOneHashKeyMultiRangeKeyInListHashPruning) {
+TEST_F(CompositeIntKeysTest, OneHashKeyMultiRangeKeyInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3) },
-                          { "a", "b" },
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 } }, { "a", "b" }, &partition_schema);
 
   KuduPartialRow split1(&schema);
   KuduPartialRow split2(&schema);
@@ -601,109 +551,76 @@ TEST_F(CompositeIntKeysTest, 
TestOneHashKeyMultiRangeKeyInListHashPruning) {
   ASSERT_OK(split2.SetInt8("b", 6));
 
   vector<Partition> partitions;
-  ASSERT_OK(partition_schema.CreatePartitions({ split1, split2 },
-                                              {},
-                                              {},
-                                              schema,
-                                              &partitions));
+  ASSERT_OK(partition_schema.CreatePartitions(
+      { split1, split2 }, {}, {}, schema, &partitions));
   ASSERT_EQ(9, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=10, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=10, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=10, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[3],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=10, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[4],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[3], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=10, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[5],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[4], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=10, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[6],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[5], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=10, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[7],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[6], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=10, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[8],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[7], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=10, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)");
+            "a IN (1, 3, 6) AND b IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[8], partition_schema, &arena_));
 }
 
 // Test that hash(a), range(b) IN list predicates prune would happen
 // on both hash and range aspects.
-TEST_F(CompositeIntKeysTest, TestDifferentHashRangeKeyInListHashPruning) {
+TEST_F(CompositeIntKeysTest, DifferentHashRangeKeyInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3) },
-                          { "b" },
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 } }, { "b" }, &partition_schema);
 
   KuduPartialRow split1(&schema);
   KuduPartialRow split2(&schema);
@@ -712,155 +629,109 @@ TEST_F(CompositeIntKeysTest, TestDifferentHashRangeKeyInListHashPruning) {
   ASSERT_OK(split2.SetInt8("b", 6));
 
   vector<Partition> partitions;
-  ASSERT_OK(partition_schema.CreatePartitions({ split1, split2 },
-                                              {},
-                                              {},
-                                              schema,
-                                              &partitions));
+  ASSERT_OK(partition_schema.CreatePartitions(
+      { split1, split2 }, {}, {}, schema, &partitions));
   ASSERT_EQ(9, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=3, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (0, 1, 2)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=3, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (0, 1, 2)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=3, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=6, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (3, 4, 5)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=6, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=6, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=10, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[3],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=3, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[4],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=3, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (0, 1, 2)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[3], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=3, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=6, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (3, 4, 5)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[5],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=6, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[4], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=6, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=10, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (6, 7, 8, 9)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[6],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[5], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=0, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=3, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (0, 1, 2)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[7],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=3, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (0, 1, 2)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[6], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=3, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=6, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (3, 4, 5)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[8],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=6, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[7], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=6, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=10, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (6, 7, 8, 9)");
+            "a IN (1, 3, 6) AND b IN (6, 7, 8, 9)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[8], partition_schema, &arena_));
 }
 
 // Test that in case hash(a) prune all predicate values, the rest predicate values for
 // pruned in-list predicate should be detect correctly by CanShortCircuit().
 // BTW, empty IN list predicates wouldn't result in the crash of OptimizeScan().
-TEST_F(CompositeIntKeysTest, TestHashKeyInListHashPruningEmptyDetect) {
+TEST_F(CompositeIntKeysTest, HashKeyInListHashPruningEmptyDetect) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 2, 4, 5, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 50, 100 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3) },
-                          {},
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 } }, {}, &partition_schema);
 
   vector<Partition> partitions;
   ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, &partitions));
   ASSERT_EQ(3, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=101, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "a IN () AND b IN (50, 100)");
+            "a IN (0, 2, 5, 9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("a IN () AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
 }
 
 // Test that hash(a), hash(b) IN list predicates should be pruned.
-TEST_F(CompositeIntKeysTest, TestMultiHashKeyOneColumnInListHashPruning) {
+TEST_F(CompositeIntKeysTest, MultiHashKeyOneColumnInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 10, 20, 30, 40, 50, 60, 70, 80 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a" }, 3),
-                            pair<vector<string>, int>({ "b" }, 3) },
-                          {},
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 }, { { "b" }, 3 }, }, {}, &partition_schema);
 
   vector<Partition> partitions;
   ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, &partitions));
@@ -869,236 +740,202 @@ TEST_F(CompositeIntKeysTest, TestMultiHashKeyOneColumnInListHashPruning) {
   // p1, p2, p3 should have the same predicate values to be pushed on hash(a).
   // p1, p4, p7 should have the same predicate values to be pushed on hash(b).
   // pi refer to partitions[i-1], e.g. p1 = partitions[0]
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=40, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=40, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=71, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=20, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=20, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=51, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=4, int8 b=10, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=4, int8 b=10, int8 c=-128) AND "
             "PK < (int8 a=8, int8 b=81, int8 c=-128) AND "
-            "a IN (4, 7, 8) AND b IN (10, 80)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[3],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=40, int8 c=-128) AND "
+            "a IN (4, 7, 8) AND b IN (10, 80)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=40, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=71, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[4],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=20, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[3], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=20, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=51, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[5],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=10, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[4], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=10, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=81, int8 c=-128) AND "
-            "a IN (0, 2, 5, 9) AND b IN (10, 80)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[6],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=40, int8 c=-128) AND "
+            "a IN (0, 2, 5, 9) AND b IN (10, 80)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[5], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=40, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=71, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[7],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=20, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[6], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=20, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=51, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[8],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=1, int8 b=10, int8 c=-128) AND "
+            "a IN (1, 3, 6) AND b IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[7], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=1, int8 b=10, int8 c=-128) AND "
             "PK < (int8 a=6, int8 b=81, int8 c=-128) AND "
-            "a IN (1, 3, 6) AND b IN (10, 80)");
+            "a IN (1, 3, 6) AND b IN (10, 80)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[8], partition_schema, &arena_));
 }
 
 // Test that hash(a, b) IN list predicates should not be pruned.
-TEST_F(CompositeIntKeysTest, TesMultiHashColumnsInListHashPruning) {
+TEST_F(CompositeIntKeysTest, MultiHashColumnsInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 50, 100 });
 
-  Schema schema = schema_.CopyWithColumnIds();
-
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a", "b" }, 3) },
-                          {},
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a", "b" }, 3 } }, {}, &partition_schema);
 
   vector<Partition> partitions;
   ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, &partitions));
   ASSERT_EQ(3, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=-128) AND "
             "PK < (int8 a=9, int8 b=101, int8 c=-128) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)");
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
 }
 
 // Test that hash(a, b), hash(c) InList predicates.
 // Neither a or b IN list can be pruned.
 // c IN list should be pruned.
-TEST_F(CompositeIntKeysTest, TesMultiHashKeyMultiHashInListHashPruning) {
+TEST_F(CompositeIntKeysTest, MultiHashKeyMultiHashInListHashPruning) {
   ScanSpec spec;
   AddInPredicate<int8_t>(&spec, "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
   AddInPredicate<int8_t>(&spec, "b", { 50, 100 });
-  AddInPredicate<int8_t>(&spec, "c", { 20, 30, 40, 50, 60, 70, 80, 90});
-
-  Schema schema = schema_.CopyWithColumnIds();
+  AddInPredicate<int8_t>(&spec, "c", { 20, 30, 40, 50, 60, 70, 80, 90 });
 
+  const auto schema = schema_.CopyWithColumnIds();
   PartitionSchema partition_schema;
-  GeneratePartitionSchema(schema,
-                          { pair<vector<string>, int>({ "a", "b" }, 3),
-                            pair<vector<string>, int>({ "c" }, 3) },
-                          {},
-                          &partition_schema);
+  GeneratePartitionSchema(
+      schema, { { { "a", "b" }, 3 }, { { "c" }, 3 } }, {}, &partition_schema);
 
   vector<Partition> partitions;
   ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, &partitions));
   ASSERT_EQ(9, partitions.size());
 
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[0],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=71) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[1],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=51) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[2],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=91) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (80, 90)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[3],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (80, 90)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=71) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[4],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[3], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=51) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[5],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[4], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=91) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (80, 90)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[6],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (80, 90)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[5], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=40) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=71) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (40, 60, 70)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[7],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (40, 60, 70)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[6], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=20) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=51) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (20, 30, 50)");
-
-  ASSERT_EQ(PruneInlistValuesAndGetSchemaString(spec,
-                                                schema,
-                                                partitions[8],
-                                                partition_schema,
-                                                &arena_),
-            "PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (20, 30, 50)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[7], partition_schema, &arena_));
+
+  ASSERT_EQ("PK >= (int8 a=0, int8 b=50, int8 c=80) AND "
             "PK < (int8 a=9, int8 b=100, int8 c=91) AND "
-            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND b IN (50, 100) AND c IN (80, 90)");
+            "a IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) AND "
+            "b IN (50, 100) AND c IN (80, 90)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[8], partition_schema, &arena_));
+}
+
+// Of course, no pruning of IN list predicate's values for non-key columns.
+TEST_F(CompositeIntKeysTest, NonKeyValuesInListHashPruning) {
+  ScanSpec spec;
+  AddInPredicate<int8_t>(&spec, "d", { 1, 2, 3, 4, 5 });
+
+  const auto schema = schema_.CopyWithColumnIds();
+  PartitionSchema partition_schema;
+  GeneratePartitionSchema(
+      schema, { { { "a" }, 3 } }, {}, &partition_schema);
+
+  vector<Partition> partitions;
+  ASSERT_OK(partition_schema.CreatePartitions({}, {}, {}, schema, &partitions));
+  ASSERT_EQ(3, partitions.size());
+
+  ASSERT_EQ("d IN (1, 2, 3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[0], partition_schema, &arena_));
+
+  ASSERT_EQ("d IN (1, 2, 3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[1], partition_schema, &arena_));
+
+  ASSERT_EQ("d IN (1, 2, 3, 4, 5)",
+            PruneInlistValuesAndGetSchemaString(
+                spec, schema, partitions[2], partition_schema, &arena_));
 }
 
 // Test that IN list mixed with range predicates get pushed into the primary key
@@ -1128,7 +965,7 @@ TEST_F(CompositeIntKeysTest, TestInListPushdownWithRange) {
 TEST_F(CompositeIntKeysTest, TestLiftPrimaryKeyBounds_NoBounds) {
   ScanSpec spec;
   spec.OptimizeScan(schema_, &arena_, false);
-  ASSERT_EQ(0, spec.predicates().size());
+  ASSERT_TRUE(spec.predicates().empty());
 }
 
 // Test that implicit constraints specified in the lower primary key bound are
diff --git a/src/kudu/common/scan_spec.cc b/src/kudu/common/scan_spec.cc
index e421829..87b3d5b 100644
--- a/src/kudu/common/scan_spec.cc
+++ b/src/kudu/common/scan_spec.cc
@@ -174,42 +174,37 @@ void ScanSpec::OptimizeScan(const Schema& schema,
 void ScanSpec::PruneInlistValuesIfPossible(const Schema& schema,
                                            const Partition& partition,
                                            const PartitionSchema& partition_schema) {
-  for (auto& predicate_pair : predicates_) {
-    auto& predicate = predicate_pair.second;
-    if (predicate.predicate_type() != PredicateType::InList) continue;
+  for (auto& [column_name, predicate] : predicates_) {
+    if (predicate.predicate_type() != PredicateType::InList) {
+      continue;
+    }
 
-    const string& col_name = predicate_pair.first;
     int32_t idx;
-    Status s = schema.FindColumn(col_name, &idx);
-    if (!s.ok() || !schema.is_key_column(idx)) continue;
-
-    int hash_idx = partition_schema.TryGetSingleColumnHashPartitionIndex(schema, idx);
-    bool is_col_single_range_schema = partition_schema.IsColumnSingleRangeSchema(schema, idx);
-    if (hash_idx == -1 && !is_col_single_range_schema) continue;
+    if (auto s = schema.FindColumn(column_name, &idx); !s.ok()) {
+      LOG(DFATAL) << s.ToString();
+      continue;
+    }
+    if (!schema.is_key_column(idx)) {
+      continue;
+    }
 
     auto* predicate_values = predicate.mutable_raw_values();
-
-    predicate_values->erase(std::remove_if(predicate_values->begin(), predicate_values->end(),
-          [idx, hash_idx, is_col_single_range_schema,
-           &schema, &partition, &partition_schema](const void* value) {
-        // Returns true indicates this value is going to be removed from the predicate values.
-        KuduPartialRow partial_row(&schema);
-        Status s = partial_row.Set(idx, reinterpret_cast<const uint8_t*>(value));
-        if (!s.ok()) return false;
-
-         // If value is not in given hash partition, remove this value from predicate values.
-        if (hash_idx != -1 && !partition_schema.HashPartitionContainsRow(
-              partition, partial_row, hash_idx)) {
-          return true;
-        }
-
-        // If value is not in given range partition, remove this value from predicate values.
-        if (is_col_single_range_schema &&
-            !partition_schema.RangePartitionContainsRow(partition, partial_row)) {
-          return true;
-        }
-        return false;
-      }), predicate_values->end());
+    predicate_values->erase(std::remove_if(
+        predicate_values->begin(),
+        predicate_values->end(),
+        [idx, &schema, &partition, &partition_schema](const void* value) {
+          // If the target partition cannot contain the row, there is no sense
+          // in searching for the value: return 'true' if the value is to be
+          // removed from the IN(...) predicate.
+          KuduPartialRow row(&schema);
+          if (auto s = row.Set(idx, reinterpret_cast<const uint8_t*>(value));
+              !s.ok()) {
+            LOG(DFATAL) << s.ToString();
+            return false;
+          }
+          return !partition_schema.PartitionMayContainRow(partition, row);
+        }),
+        predicate_values->end());
   }
 }
 
diff --git a/src/kudu/common/scan_spec.h b/src/kudu/common/scan_spec.h
index 9843dae..a660502 100644
--- a/src/kudu/common/scan_spec.h
+++ b/src/kudu/common/scan_spec.h
@@ -73,22 +73,21 @@ class ScanSpec {
                     Arena* arena,
                     bool remove_pushed_predicates);
 
-  // Filter in-list predicate values with given hash partition schema.
-  // If range partition is introduced when creating table, in-list predicate
-  // can also benefit from this pruning.
+  // Filter in-list predicate values with a given partition schema.
   //
-  // Only supports pruning for single-column hash schemas or single-column range schema.
-  // Now support hash prune on:
-  //     hash(onekey), # support.
-  //     range(onekey), # support.
-  //     hash(onekey), hash(anotherkey) # support either.
-  //     hash(onekey), range(anotherkey) # support either.
-  //     hash(key_one, key_two), hash(anotherkey) # only support prune on anotherkey.
-  //     range(key_one, key_two) # not support.
+  // Supports pruning only for single-column hash and range schemas. The pruning
+  // of IN list predicate's values is enabled for the following partitioning
+  // patterns:
   //
-  // TODO(ningw) For IN list predicate on hash/range(key_one, key_two) or more columns,
-  // if one predicate is IN list, and the rest predicate(s) are EQUAL, could
-  // have IN list predicate values prune as well.
+  //   hash(onekey),                            # pruning on 'onekey'
+  //   range(onekey),                           # pruning on 'onekey'
+  //   hash(onekey), hash(anotherkey)           # pruning on either key
+  //   hash(onekey), range(anotherkey)          # pruning on either key
+  //   hash(key_one, key_two), hash(anotherkey) # pruning on 'anotherkey'
+  //
+  // TODO(ningw) For IN list predicate on hash/range(key_one, key_two) or more
+  //             columns, if one predicate is IN list, and the rest predicate(s)
+  //             are EQUAL, could have IN list predicate values prune as well.
   void PruneInlistValuesIfPossible(const Schema& schema,
                                    const Partition& partition,
                                    const PartitionSchema& partition_schema);

[kudu] branch master updated: [common] more generic API for IN list predicate pruning

Reply via email to