wgtmac commented on code in PR #664:
URL: https://github.com/apache/iceberg-cpp/pull/664#discussion_r3292032459


##########
src/iceberg/test/manifest_group_test.cc:
##########
@@ -404,6 +405,123 @@ TEST_P(ManifestGroupTest, CustomManifestEntriesFilter) {
                                                              
"/path/to/data3.parquet"));
 }
 
+TEST_P(ManifestGroupTest, FilterFilesByRecordCount) {
+  auto version = GetParam();
+
+  constexpr int64_t kSnapshotId = 1000L;
+  const auto part_value = PartitionValues({Literal::Int(0)});
+
+  std::vector<ManifestEntry> data_entries{
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/small.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/5)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/boundary.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/10)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/large.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/15))};
+  auto data_manifest =
+      WriteDataManifest(version, kSnapshotId, std::move(data_entries), 
partitioned_spec_);
+
+  std::vector<ManifestFile> manifests = {data_manifest};
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto group,
+      ManifestGroup::Make(file_io_, schema_, GetSpecsById(), 
std::move(manifests)));
+  group->FilterFiles(Expressions::GreaterThanOrEqual("record_count", 
Literal::Long(10)));

Review Comment:
   Does case sensitivity matter here? For example what if users set 
case-inventive and provide `RECORD_COUNT` as the column name?



##########
src/iceberg/test/manifest_group_test.cc:
##########
@@ -404,6 +405,123 @@ TEST_P(ManifestGroupTest, CustomManifestEntriesFilter) {
                                                              
"/path/to/data3.parquet"));
 }
 
+TEST_P(ManifestGroupTest, FilterFilesByRecordCount) {
+  auto version = GetParam();
+
+  constexpr int64_t kSnapshotId = 1000L;
+  const auto part_value = PartitionValues({Literal::Int(0)});
+
+  std::vector<ManifestEntry> data_entries{
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/small.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/5)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/boundary.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/10)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/large.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/15))};
+  auto data_manifest =
+      WriteDataManifest(version, kSnapshotId, std::move(data_entries), 
partitioned_spec_);
+
+  std::vector<ManifestFile> manifests = {data_manifest};
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto group,
+      ManifestGroup::Make(file_io_, schema_, GetSpecsById(), 
std::move(manifests)));
+  group->FilterFiles(Expressions::GreaterThanOrEqual("record_count", 
Literal::Long(10)));
+
+  ICEBERG_UNWRAP_OR_FAIL(auto entries, group->Entries());
+  EXPECT_THAT(GetEntryPaths(entries),
+              testing::UnorderedElementsAre("/path/to/boundary.parquet",
+                                            "/path/to/large.parquet"));
+}
+
+TEST_P(ManifestGroupTest, FilterFilesRejectsPartitionMetadata) {
+  auto version = GetParam();
+
+  constexpr int64_t kSnapshotId = 1000L;
+  const auto part_value = PartitionValues({Literal::Int(0)});
+
+  std::vector<ManifestEntry> data_entries{MakeEntry(
+      ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+      MakeDataFile("/path/to/data.parquet", part_value, 
partitioned_spec_->spec_id()))};
+  auto data_manifest =
+      WriteDataManifest(version, kSnapshotId, std::move(data_entries), 
partitioned_spec_);
+
+  std::vector<ManifestFile> manifests = {data_manifest};
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto group,
+      ManifestGroup::Make(file_io_, schema_, GetSpecsById(), 
std::move(manifests)));
+  group->FilterFiles(Expressions::Equal("partition.data_bucket_16_2", 
Literal::Int(1)));
+
+  auto result = group->Entries();
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidExpression));
+  EXPECT_THAT(result, HasErrorMessage("Cannot find field 
'partition.data_bucket_16_2'"));

Review Comment:
   Why we don't skip unknown predicates? Usually we just ignore unknown or 
malformed predicates in practice (e.g. assume they are always evaluated to 
match), otherwise any malformed predicate can prohibit any table scan to 
proceed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to