This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 2bd493c0 feat: implement location provider (#467)
2bd493c0 is described below
commit 2bd493c0ec67e8676e719209ed4d1f7a1a743150
Author: wzhuo <[email protected]>
AuthorDate: Mon Jan 12 13:49:15 2026 +0800
feat: implement location provider (#467)
---
src/iceberg/CMakeLists.txt | 1 +
src/iceberg/location_provider.cc | 206 +++++++++++++++++++++++++++++
src/iceberg/location_provider.h | 31 +++--
src/iceberg/meson.build | 1 +
src/iceberg/test/CMakeLists.txt | 1 +
src/iceberg/test/location_provider_test.cc | 149 +++++++++++++++++++++
src/iceberg/test/meson.build | 1 +
src/iceberg/util/location_util.h | 11 +-
8 files changed, 383 insertions(+), 18 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 47a2cabc..3e0c66f9 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -40,6 +40,7 @@ set(ICEBERG_SOURCES
file_writer.cc
inheritable_metadata.cc
json_internal.cc
+ location_provider.cc
manifest/manifest_adapter.cc
manifest/manifest_entry.cc
manifest/manifest_group.cc
diff --git a/src/iceberg/location_provider.cc b/src/iceberg/location_provider.cc
new file mode 100644
index 00000000..a03f41e5
--- /dev/null
+++ b/src/iceberg/location_provider.cc
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0F;  // Keeps the 4 bits that name one entropy directory.
+constexpr uint8_t kRestDirMask = 0xFF;  // Keeps the low 8 bits used for the last hash component.
+constexpr int32_t kHashBits = 20;  // Directory bits are taken from just below this bit position.
+constexpr int32_t kEntropyDirLength = 4;  // Number of hash bits consumed per entropy directory.
+constexpr int32_t kEntropyDirDepth = 3;  // Number of entropy directory levels.
+
+/// \brief Resolve the base location under which data files are written.
+///
+/// \param properties Table properties consulted for TableProperties::kWriteDataLocation.
+/// \param location Table location used to build the fallback.
+/// \return The configured value when it is non-empty, otherwise "<location>/data".
+std::string DataLocation(const TableProperties& properties, std::string_view location) {
+  auto configured = properties.Get(TableProperties::kWriteDataLocation);
+  if (!configured.empty()) {
+    return configured;
+  }
+  return std::format("{}/data", location);
+}
+
+/// \brief Build the "<parent>/<name>" context from the last two segments of a location.
+///
+/// Trailing slashes are stripped first. When the stripped location has no '/' that is
+/// followed by at least one character, the stripped location is returned as-is.
+///
+/// \param location Table location.
+/// \return The context string derived from `location`.
+std::string PathContext(std::string_view location) {
+  const std::string_view path = LocationUtil::StripTrailingSlash(location);
+
+  const size_t last_slash = path.find_last_of('/');
+  if (last_slash == std::string_view::npos || last_slash + 1 >= path.length()) {
+    // Nothing to split on: return the stripped form rather than the raw input so a
+    // trailing slash of `location` does not leak into the context.
+    return std::string(path);
+  }
+
+  const std::string_view data_path = path.substr(last_slash + 1);
+  const std::string_view parent_path = path.substr(0, last_slash);
+
+  // Keep only the final segment of the parent; a parent without '/' is used whole.
+  const size_t parent_last_slash = parent_path.find_last_of('/');
+  const std::string_view parent_name = parent_last_slash == std::string_view::npos
+                                           ? parent_path
+                                           : parent_path.substr(parent_last_slash + 1);
+  return std::format("{}/{}", parent_name, data_path);
+}
+
+/// \brief Divides hash into directories for optimized orphan removal operation using
+/// kEntropyDirDepth and kEntropyDirLength.
+///
+/// kEntropyDirDepth directories of kEntropyDirLength bits each are taken from below bit
+/// kHashBits, most significant first, followed by one component holding the low 8 bits.
+///
+/// If the low `kHashBits = 20` of `hash` is '10011001100110011001', then return
+/// '1001/1001/1001/10011001' with depth 3 and length 4.
+///
+/// \param hash The hash value to be divided.
+/// \return The path according to the `hash` value.
+std::string DirsFromHash(int32_t hash) {
+  std::string dirs;
+
+  for (int32_t level = 0; level < kEntropyDirDepth; ++level) {
+    // Shift the bits of this level down to the lowest positions before masking.
+    const int32_t shift = kHashBits - (level + 1) * kEntropyDirLength;
+    const uint8_t dir_bits = kEntropyDirMask & (hash >> shift);
+    dirs += std::format("{:04b}", dir_bits);
+    dirs += "/";
+  }
+
+  const uint8_t rest_bits = kRestDirMask & hash;
+  dirs += std::format("{:08b}", rest_bits);
+
+  return dirs;
+}
+
+/// \brief Hash a file name with MurmurHash3_x86_32 (seed 0) and render the result with
+/// DirsFromHash.
+///
+/// \param file_name Name whose bytes are hashed.
+/// \return The directory path produced by DirsFromHash for the computed hash.
+std::string ComputeHash(std::string_view file_name) {
+  int32_t digest{0};
+  MurmurHash3_x86_32(file_name.data(), file_name.size(), /*seed=*/0, &digest);
+  return DirsFromHash(digest);
+}
+
+} // namespace
+
+/// \brief Default location provider for local file system.
+///
+/// Data files are placed directly under the data location, optionally inside a
+/// partition path. The class is a leaf of the LocationProvider hierarchy, hence `final`.
+class DefaultLocationProvider final : public LocationProvider {
+ public:
+  /// \param location Table location.
+  /// \param properties Table properties used to resolve the data location.
+  DefaultLocationProvider(std::string_view location, const TableProperties& properties);
+
+  /// \brief Return "<data location>/<filename>".
+  std::string NewDataLocation(std::string_view filename) override;
+
+  /// \brief Return "<data location>/<partition path>/<filename>", or the error from
+  /// computing the partition path.
+  Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                      const PartitionValues& partition,
+                                      std::string_view filename) override;
+
+ private:
+  /// Data location with trailing slashes stripped.
+  std::string data_location_;
+};
+
+// Implementation of DefaultLocationProvider
+//
+// The data location is resolved once and stored without trailing slashes.
+DefaultLocationProvider::DefaultLocationProvider(std::string_view location,
+                                                 const TableProperties& properties) {
+  const std::string resolved = DataLocation(properties, location);
+  data_location_ = LocationUtil::StripTrailingSlash(resolved);
+}
+
+/// \brief Join the data location and `filename` with a single '/'.
+///
+/// \param filename File name appended to the data location.
+/// \return "<data location>/<filename>".
+std::string DefaultLocationProvider::NewDataLocation(std::string_view filename) {
+  const std::string_view base = data_location_;
+  std::string full_path = std::format("{}/{}", base, filename);
+  return full_path;
+}
+
+/// \brief Data file location inside the directory of the given partition.
+///
+/// \param spec Partition spec used to compute the partition path.
+/// \param partition Partition values passed to `spec.PartitionPath`.
+/// \param filename File name appended after the partition path.
+/// \return "<data location>/<partition path>/<filename>", or the error produced while
+/// computing the partition path.
+Result<std::string> DefaultLocationProvider::NewDataLocation(
+    const PartitionSpec& spec, const PartitionValues& partition,
+    std::string_view filename) {
+  ICEBERG_ASSIGN_OR_RAISE(auto partition_dir, spec.PartitionPath(partition));
+  std::string full_path =
+      std::format("{}/{}/{}", data_location_, partition_dir, filename);
+  return full_path;
+}
+
+/// \brief Location provider for object stores.
+///
+/// A hash-derived directory prefix is injected into every data file location. The class
+/// is a leaf of the LocationProvider hierarchy, hence `final`.
+class ObjectStoreLocationProvider final : public LocationProvider {
+ public:
+  /// \param location Table location.
+  /// \param properties Table properties used to resolve the storage location and
+  /// whether partition paths are included.
+  ObjectStoreLocationProvider(std::string_view location,
+                              const TableProperties& properties);
+
+  /// \brief Return a location under the storage location that embeds the hash of
+  /// `filename`.
+  std::string NewDataLocation(std::string_view filename) override;
+
+  /// \brief Same as the single-argument overload; the partition path is prepended to
+  /// `filename` only when partition paths are included.
+  Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                      const PartitionValues& partition,
+                                      std::string_view filename) override;
+
+ private:
+  /// Data location with trailing slashes stripped.
+  std::string storage_location_;
+  /// Path context of the table location; left empty when the storage location starts
+  /// with the table location.
+  std::string context_;
+  /// Value of TableProperties::kWriteObjectStorePartitionedPaths.
+  bool include_partition_paths_;
+};
+
+// Implementation of ObjectStoreLocationProvider
+//
+// Members are initialized in declaration order; `context_` starts out empty.
+ObjectStoreLocationProvider::ObjectStoreLocationProvider(
+    std::string_view location, const TableProperties& properties)
+    : storage_location_(
+          LocationUtil::StripTrailingSlash(DataLocation(properties, location))),
+      include_partition_paths_(
+          properties.Get(TableProperties::kWriteObjectStorePartitionedPaths)) {
+  // If the storage location is within the table prefix, don't add table and database
+  // name context
+  const bool within_table_prefix = storage_location_.starts_with(location);
+  if (!within_table_prefix) {
+    context_ = PathContext(location);
+  }
+}
+
+/// \brief Build a data file location that embeds the hash of `filename`.
+///
+/// \param filename File name (it may already carry a partition path prefix).
+/// \return "<storage>/<hash>/<context>/<filename>" when a context is set; otherwise
+/// "<storage>/<hash>/<filename>" when partition paths are included, or
+/// "<storage>/<hash>-<filename>" when they are not.
+std::string ObjectStoreLocationProvider::NewDataLocation(std::string_view filename) {
+  const std::string entropy = ComputeHash(filename);
+
+  if (!context_.empty()) {
+    return std::format("{}/{}/{}/{}", storage_location_, entropy, context_, filename);
+  }
+
+  // If partition paths are included, add last part of entropy as dir before partition
+  // names
+  if (include_partition_paths_) {
+    return std::format("{}/{}/{}", storage_location_, entropy, filename);
+  }
+
+  // If partition paths are not included, append last part of entropy with `-` to file
+  // name
+  return std::format("{}/{}-{}", storage_location_, entropy, filename);
+}
+
+/// \brief Build a data file location for a file that belongs to a partition.
+///
+/// \param spec Partition spec used to compute the partition path.
+/// \param partition Partition values passed to `spec.PartitionPath`.
+/// \param filename File name.
+/// \return The location for "<partition path>/<filename>" when partition paths are
+/// included (or the error from computing the partition path), otherwise the location
+/// for `filename` alone.
+Result<std::string> ObjectStoreLocationProvider::NewDataLocation(
+    const PartitionSpec& spec, const PartitionValues& partition,
+    std::string_view filename) {
+  if (!include_partition_paths_) {
+    // The partition is ignored entirely in this mode.
+    return NewDataLocation(filename);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto partition_dir, spec.PartitionPath(partition));
+  return NewDataLocation(std::format("{}/{}", partition_dir, filename));
+}
+
+/// \brief Create the provider selected by TableProperties::kObjectStoreEnabled.
+///
+/// Trailing slashes are stripped from `location` before it is handed to the provider.
+Result<std::unique_ptr<LocationProvider>> LocationProvider::Make(
+    std::string_view location, const TableProperties& properties) {
+  const std::string_view table_location = LocationUtil::StripTrailingSlash(location);
+
+  // TODO(xxx): create location provider specified by "write.location-provider.impl"
+
+  if (!properties.Get(TableProperties::kObjectStoreEnabled)) {
+    return std::make_unique<DefaultLocationProvider>(table_location, properties);
+  }
+  return std::make_unique<ObjectStoreLocationProvider>(table_location, properties);
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/location_provider.h b/src/iceberg/location_provider.h
index 90c63eb6..6b6fa447 100644
--- a/src/iceberg/location_provider.h
+++ b/src/iceberg/location_provider.h
@@ -19,38 +19,45 @@
#pragma once
+#include <memory>
#include <string>
+#include <string_view>
#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
namespace iceberg {
-/// \brief Interface for providing data file locations to write tasks.
+/// \brief Interface for providing data file locations.
class ICEBERG_EXPORT LocationProvider {
public:
virtual ~LocationProvider() = default;
/// \brief Return a fully-qualified data file location for the given
filename.
///
- /// \param filename a file name
+ /// \param filename file name to get location
/// \return a fully-qualified location URI for a data file
- virtual std::string NewDataLocation(const std::string& filename) = 0;
+ virtual std::string NewDataLocation(std::string_view filename) = 0;
/// \brief Return a fully-qualified data file location for the given
partition and
/// filename.
///
- /// \param spec a partition spec
- /// \param partition_data a tuple of partition data for data in the file,
matching the
- /// given spec
- /// \param filename a file name
+ /// \param spec partition spec
+ /// \param partition a tuple of partition values matching the given spec
+ /// \param filename file name
/// \return a fully-qualified location URI for a data file
+ virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+ const PartitionValues& partition,
+ std::string_view filename) = 0;
+
+ /// \brief Create a LocationProvider for the given table location and
properties.
///
- /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an
ArrowArray
- /// with single row in StructLike.
- virtual std::string NewDataLocation(const PartitionSpec& spec,
- const StructLike& partition_data,
- const std::string& filename) = 0;
+ /// \param location table location
+ /// \param properties table properties
+ /// \return a LocationProvider instance
+ static Result<std::unique_ptr<LocationProvider>> Make(
+ std::string_view location, const TableProperties& properties);
};
} // namespace iceberg
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 1a59c894..3164f390 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -61,6 +61,7 @@ iceberg_sources = files(
'file_writer.cc',
'inheritable_metadata.cc',
'json_internal.cc',
+ 'location_provider.cc',
'manifest/manifest_adapter.cc',
'manifest/manifest_entry.cc',
'manifest/manifest_group.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 0e41fcfb..6124b6bc 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -73,6 +73,7 @@ add_iceberg_test(schema_test
add_iceberg_test(table_test
SOURCES
+ location_provider_test.cc
metrics_config_test.cc
snapshot_test.cc
snapshot_util_test.cc
diff --git a/src/iceberg/test/location_provider_test.cc
b/src/iceberg/test/location_provider_test.cc
new file mode 100644
index 00000000..b287ded7
--- /dev/null
+++ b/src/iceberg/test/location_provider_test.cc
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include <gtest/gtest.h>
+
+#include "iceberg/location_provider.h"
+#include "iceberg/partition_spec.h"
+#include "iceberg/row/partition_values.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/transform.h"
+
+namespace iceberg {
+
+namespace {
+
+// Helper function to split a string by delimiter.
+//
+// Empty fields between delimiters are kept; a single empty field after a trailing
+// delimiter is not produced, and an empty input yields no fields.
+std::vector<std::string> SplitString(const std::string& str, char delimiter) {
+  std::vector<std::string> fields;
+
+  std::string::size_type begin = 0;
+  while (begin < str.size()) {
+    const std::string::size_type end = str.find(delimiter, begin);
+    if (end == std::string::npos) {
+      fields.push_back(str.substr(begin));
+      break;
+    }
+    fields.push_back(str.substr(begin, end - begin));
+    begin = end + 1;
+  }
+
+  return fields;
+}
+
+} // namespace
+
+// Fixture providing a table location and default-constructed table properties.
+class LocationProviderTest : public ::testing::Test {
+ protected:
+  std::string table_location_ = "/test/table/location";
+  TableProperties properties_;
+};
+
+// With no properties set, the file is expected under "<table location>/data".
+TEST_F(LocationProviderTest, DefaultLocationProvider) {
+  properties_ = {};  // Empty properties to use defaults
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  const std::string expected = std::format("{}/data/my_file", table_location_);
+  const auto actual = provider->NewDataLocation("my_file");
+  EXPECT_EQ(expected, actual);
+}
+
+// An explicit write data location is used instead of "<table location>/data".
+TEST_F(LocationProviderTest, DefaultLocationProviderWithCustomDataLocation) {
+  std::ignore =
+      properties_.Set(TableProperties::kWriteDataLocation, std::string("new_location"));
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  const auto actual = provider->NewDataLocation("my_file");
+  EXPECT_EQ("new_location/my_file", actual);
+}
+
+// With object storage enabled, the location is expected to be
+// "<table location>/data/<4 hash components>/<filename>".
+TEST_F(LocationProviderTest, ObjectStorageLocationProvider) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  const std::string location = provider->NewDataLocation("test.parquet");
+  // Fail here with the offending value instead of silently skipping the prefix removal
+  // and tripping over a confusing part count below.
+  ASSERT_TRUE(location.starts_with(table_location_)) << location;
+  const std::string relative_location = location.substr(table_location_.size());
+
+  const std::vector<std::string> parts = SplitString(relative_location, '/');
+  // Unsigned literal and index keep the comparisons free of signed/unsigned mixing.
+  ASSERT_EQ(7u, parts.size());
+  EXPECT_EQ("", parts[0]);
+  EXPECT_EQ("data", parts[1]);
+  // parts[2..5] are the hash-derived components.
+  for (size_t i = 2; i <= 5; ++i) {
+    EXPECT_FALSE(parts[i].empty());
+  }
+  EXPECT_EQ("test.parquet", parts[6]);
+}
+
+// The partition path is expected as the directory right before the file name, with the
+// field name and value escaped as shown in the expectation below.
+TEST_F(LocationProviderTest, ObjectStorageWithPartition) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto spec,
+      PartitionSpec::Make(PartitionSpec::kInitialSpecId,
+                          {PartitionField(1, 1, "data#1", Transform::Identity())},
+                          PartitionSpec::kInvalidPartitionFieldId + 1));
+  PartitionValues partition_values({Literal::String("val#1")});
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto location, provider->NewDataLocation(*spec, partition_values, "test.parquet"));
+
+  const std::vector<std::string> parts = SplitString(location, '/');
+  ASSERT_GT(parts.size(), 2u);
+  const std::string& partition_dir = parts[parts.size() - 2];
+  EXPECT_EQ("data%231=%22val%231%22", partition_dir);
+}
+
+// With partitioned paths disabled, the hash is joined to the file name with '-'.
+TEST_F(LocationProviderTest, ObjectStorageExcludePartitionInPath) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true)
+                    .Set(TableProperties::kWriteObjectStorePartitionedPaths, false);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  const auto actual = provider->NewDataLocation("test.parquet");
+
+  EXPECT_THAT(actual, testing::HasSubstr(table_location_));
+  EXPECT_THAT(actual, testing::HasSubstr("/data/"));
+  EXPECT_THAT(actual, testing::HasSubstr("-test.parquet"));
+}
+
+// Pins the exact hash directories injected for a few one-letter file names.
+TEST_F(LocationProviderTest, HashInjection) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  struct Expectation {
+    const char* filename;
+    const char* suffix;
+  };
+  const Expectation expectations[] = {
+      {"a", "/data/0101/0110/1001/10110010/a"},
+      {"b", "/data/1110/0111/1110/00000011/b"},
+      {"c", "/data/0010/1101/0110/01011111/c"},
+      {"d", "/data/1001/0001/0100/01110011/d"},
+  };
+
+  for (const auto& [filename, suffix] : expectations) {
+    EXPECT_THAT(provider->NewDataLocation(filename), testing::EndsWith(suffix));
+  }
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index e15a7a9e..95c68962 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -46,6 +46,7 @@ iceberg_tests = {
},
'table_test': {
'sources': files(
+ 'location_provider_test.cc',
'metrics_config_test.cc',
'snapshot_test.cc',
'snapshot_util_test.cc',
diff --git a/src/iceberg/util/location_util.h b/src/iceberg/util/location_util.h
index 547dd5ce..eb78dece 100644
--- a/src/iceberg/util/location_util.h
+++ b/src/iceberg/util/location_util.h
@@ -19,7 +19,7 @@
#pragma once
-#include <string>
+#include <string_view>
#include "iceberg/iceberg_export.h"
@@ -27,16 +27,15 @@ namespace iceberg {
class ICEBERG_EXPORT LocationUtil {
public:
- static std::string StripTrailingSlash(const std::string& path) {
+ static std::string_view StripTrailingSlash(std::string_view path) {
if (path.empty()) {
return "";
}
- std::string_view result = path;
- while (result.ends_with("/") && !result.ends_with("://")) {
- result.remove_suffix(1);
+ while (path.ends_with("/") && !path.ends_with("://")) {
+ path.remove_suffix(1);
}
- return std::string(result);
+ return path;
}
};