This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 2bd493c0 feat: implement location provider (#467)
2bd493c0 is described below

commit 2bd493c0ec67e8676e719209ed4d1f7a1a743150
Author: wzhuo <[email protected]>
AuthorDate: Mon Jan 12 13:49:15 2026 +0800

    feat: implement location provider (#467)
---
 src/iceberg/CMakeLists.txt                 |   1 +
 src/iceberg/location_provider.cc           | 206 +++++++++++++++++++++++++++++
 src/iceberg/location_provider.h            |  31 +++--
 src/iceberg/meson.build                    |   1 +
 src/iceberg/test/CMakeLists.txt            |   1 +
 src/iceberg/test/location_provider_test.cc | 149 +++++++++++++++++++++
 src/iceberg/test/meson.build               |   1 +
 src/iceberg/util/location_util.h           |  11 +-
 8 files changed, 383 insertions(+), 18 deletions(-)

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 47a2cabc..3e0c66f9 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -40,6 +40,7 @@ set(ICEBERG_SOURCES
     file_writer.cc
     inheritable_metadata.cc
     json_internal.cc
+    location_provider.cc
     manifest/manifest_adapter.cc
     manifest/manifest_entry.cc
     manifest/manifest_group.cc
diff --git a/src/iceberg/location_provider.cc b/src/iceberg/location_provider.cc
new file mode 100644
index 00000000..a03f41e5
--- /dev/null
+++ b/src/iceberg/location_provider.cc
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0F;
+constexpr uint8_t kRestDirMask = 0xFF;
+constexpr int32_t kHashBits = 20;
+constexpr int32_t kEntropyDirLength = 4;
+constexpr int32_t kEntropyDirDepth = 3;
+
+std::string DataLocation(const TableProperties& properties, std::string_view location) {
+  auto data_location = properties.Get(TableProperties::kWriteDataLocation);
+  if (data_location.empty()) {
+    data_location = std::format("{}/data", location);
+  }
+  return data_location;
+}
+
+std::string PathContext(std::string_view location) {
+  std::string_view path = LocationUtil::StripTrailingSlash(location);
+
+  size_t last_slash = path.find_last_of('/');
+  if (last_slash != std::string_view::npos && last_slash < path.length() - 1) {
+    std::string_view data_path = path.substr(last_slash + 1);
+    std::string_view parent_path(path.data(), last_slash);
+    size_t parent_last_slash = parent_path.find_last_of('/');
+
+    if (parent_last_slash != std::string::npos) {
+      std::string_view parent_name = parent_path.substr(parent_last_slash + 1);
+      return std::format("{}/{}", parent_name, data_path);
+    } else {
+      return std::format("{}/{}", parent_path, data_path);
+    }
+  }
+
+  return std::string(location);
+}
+
+/// \brief Divides hash into directories for optimized orphan removal operation using
+/// kEntropyDirDepth and kEntropyDirLength.
+///
+/// If the low `kHashBits = 20` of `hash` is '10011001100110011001', then return
+/// '1001/1001/1001/10011001' with depth 3 and length 4.
+///
+/// \param hash The hash value to be divided.
+/// \return The path according to the `hash` value.
+std::string DirsFromHash(int32_t hash) {
+  std::string hash_with_dirs;
+
+  for (int32_t i = 0; i < kEntropyDirDepth * kEntropyDirLength; i += kEntropyDirLength) {
+    if (i > 0) {
+      hash_with_dirs += "/";
+    }
+    uint8_t dir_bits = kEntropyDirMask & (hash >> (kHashBits - i - kEntropyDirLength));
+    hash_with_dirs += std::format("{:04b}", dir_bits);
+  }
+
+  hash_with_dirs += "/";
+  uint8_t rest_bits = kRestDirMask & hash;
+  hash_with_dirs += std::format("{:08b}", rest_bits);
+
+  return hash_with_dirs;
+}
+
+std::string ComputeHash(std::string_view file_name) {
+  int32_t hash_value = 0;
+  MurmurHash3_x86_32(file_name.data(), file_name.size(), 0, &hash_value);
+  return DirsFromHash(hash_value);
+}
+
+}  // namespace
+
+// Default location provider for local file system.
+class DefaultLocationProvider : public LocationProvider {
+ public:
+  DefaultLocationProvider(std::string_view location, const TableProperties& properties);
+
+  std::string NewDataLocation(std::string_view filename) override;
+
+  Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                      const PartitionValues& partition,
+                                      std::string_view filename) override;
+
+ private:
+  std::string data_location_;
+};
+
+// Implementation of DefaultLocationProvider
+DefaultLocationProvider::DefaultLocationProvider(std::string_view location,
+                                                 const TableProperties& properties)
+    : data_location_(
+          LocationUtil::StripTrailingSlash(DataLocation(properties, location))) {}
+
+std::string DefaultLocationProvider::NewDataLocation(std::string_view filename) {
+  return std::format("{}/{}", data_location_, filename);
+}
+
+Result<std::string> DefaultLocationProvider::NewDataLocation(
+    const PartitionSpec& spec, const PartitionValues& partition,
+    std::string_view filename) {
+  ICEBERG_ASSIGN_OR_RAISE(auto partition_path, spec.PartitionPath(partition));
+  return std::format("{}/{}/{}", data_location_, partition_path, filename);
+}
+
+// Location provider for object stores.
+class ObjectStoreLocationProvider : public LocationProvider {
+ public:
+  ObjectStoreLocationProvider(std::string_view location,
+                              const TableProperties& properties);
+
+  std::string NewDataLocation(std::string_view filename) override;
+
+  Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                      const PartitionValues& partition,
+                                      std::string_view filename) override;
+
+ private:
+  std::string storage_location_;
+  std::string context_;
+  bool include_partition_paths_;
+};
+
+// Implementation of ObjectStoreLocationProvider
+ObjectStoreLocationProvider::ObjectStoreLocationProvider(
+    std::string_view location, const TableProperties& properties)
+    : include_partition_paths_(
+          properties.Get(TableProperties::kWriteObjectStorePartitionedPaths)) {
+  storage_location_ =
+      LocationUtil::StripTrailingSlash(DataLocation(properties, location));
+
+  // If the storage location is within the table prefix, don't add table and database name
+  // context
+  if (!storage_location_.starts_with(location)) {
+    context_ = PathContext(location);
+  }
+}
+
+std::string ObjectStoreLocationProvider::NewDataLocation(std::string_view filename) {
+  std::string hash = ComputeHash(filename);
+
+  if (!context_.empty()) {
+    return std::format("{}/{}/{}/{}", storage_location_, hash, context_, filename);
+  } else {
+    // If partition paths are included, add last part of entropy as dir before partition
+    // names
+    if (include_partition_paths_) {
+      return std::format("{}/{}/{}", storage_location_, hash, filename);
+    } else {
+      // If partition paths are not included, append last part of entropy with `-` to file
+      // name
+      return std::format("{}/{}-{}", storage_location_, hash, filename);
+    }
+  }
+}
+
+Result<std::string> ObjectStoreLocationProvider::NewDataLocation(
+    const PartitionSpec& spec, const PartitionValues& partition,
+    std::string_view filename) {
+  if (include_partition_paths_) {
+    ICEBERG_ASSIGN_OR_RAISE(auto partition_path, spec.PartitionPath(partition));
+    return NewDataLocation(std::format("{}/{}", partition_path, filename));
+  } else {
+    return NewDataLocation(filename);
+  }
+}
+
+Result<std::unique_ptr<LocationProvider>> LocationProvider::Make(
+    std::string_view location, const TableProperties& properties) {
+  location = LocationUtil::StripTrailingSlash(location);
+
+  // TODO(xxx): create location provider specified by "write.location-provider.impl"
+
+  if (properties.Get(TableProperties::kObjectStoreEnabled)) {
+    return std::make_unique<ObjectStoreLocationProvider>(location, properties);
+  } else {
+    return std::make_unique<DefaultLocationProvider>(location, properties);
+  }
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/location_provider.h b/src/iceberg/location_provider.h
index 90c63eb6..6b6fa447 100644
--- a/src/iceberg/location_provider.h
+++ b/src/iceberg/location_provider.h
@@ -19,38 +19,45 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <string_view>
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 #include "iceberg/type_fwd.h"
 
 namespace iceberg {
 
-/// \brief Interface for providing data file locations to write tasks.
+/// \brief Interface for providing data file locations.
 class ICEBERG_EXPORT LocationProvider {
  public:
   virtual ~LocationProvider() = default;
 
   /// \brief Return a fully-qualified data file location for the given filename.
   ///
-  /// \param filename a file name
+  /// \param filename file name to get location
   /// \return a fully-qualified location URI for a data file
-  virtual std::string NewDataLocation(const std::string& filename) = 0;
+  virtual std::string NewDataLocation(std::string_view filename) = 0;
 
   /// \brief Return a fully-qualified data file location for the given partition and
   /// filename.
   ///
-  /// \param spec a partition spec
-  /// \param partition_data a tuple of partition data for data in the file, matching the
-  /// given spec
-  /// \param filename a file name
+  /// \param spec partition spec
+  /// \param partition a tuple of partition values matching the given spec
+  /// \param filename file name
   /// \return a fully-qualified location URI for a data file
+  virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                              const PartitionValues& partition,
+                                              std::string_view filename) = 0;
+
+  /// \brief Create a LocationProvider for the given table location and properties.
   ///
-  /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an ArrowArray
-  /// with single row in StructLike.
-  virtual std::string NewDataLocation(const PartitionSpec& spec,
-                                      const StructLike& partition_data,
-                                      const std::string& filename) = 0;
+  /// \param location table location
+  /// \param properties table properties
+  /// \return a LocationProvider instance
+  static Result<std::unique_ptr<LocationProvider>> Make(
+      std::string_view location, const TableProperties& properties);
 };
 
 }  // namespace iceberg
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 1a59c894..3164f390 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -61,6 +61,7 @@ iceberg_sources = files(
     'file_writer.cc',
     'inheritable_metadata.cc',
     'json_internal.cc',
+    'location_provider.cc',
     'manifest/manifest_adapter.cc',
     'manifest/manifest_entry.cc',
     'manifest/manifest_group.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 0e41fcfb..6124b6bc 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -73,6 +73,7 @@ add_iceberg_test(schema_test
 
 add_iceberg_test(table_test
                  SOURCES
+                 location_provider_test.cc
                  metrics_config_test.cc
                  snapshot_test.cc
                  snapshot_util_test.cc
diff --git a/src/iceberg/test/location_provider_test.cc b/src/iceberg/test/location_provider_test.cc
new file mode 100644
index 00000000..b287ded7
--- /dev/null
+++ b/src/iceberg/test/location_provider_test.cc
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include <gtest/gtest.h>
+
+#include "iceberg/location_provider.h"
+#include "iceberg/partition_spec.h"
+#include "iceberg/row/partition_values.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/transform.h"
+
+namespace iceberg {
+
+namespace {
+
+// Helper function to split a string by delimiter
+std::vector<std::string> SplitString(const std::string& str, char delimiter) {
+  std::vector<std::string> result;
+  std::stringstream ss(str);
+  std::string item;
+
+  while (std::getline(ss, item, delimiter)) {
+    result.push_back(item);
+  }
+
+  return result;
+}
+
+}  // namespace
+
+class LocationProviderTest : public ::testing::Test {
+ protected:
+  void SetUp() override { table_location_ = "/test/table/location"; }
+
+  std::string table_location_;
+  TableProperties properties_;
+};
+
+TEST_F(LocationProviderTest, DefaultLocationProvider) {
+  properties_ = {};  // Empty properties to use defaults
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  auto location = provider->NewDataLocation("my_file");
+  EXPECT_EQ(std::format("{}/data/my_file", table_location_), location);
+}
+
+TEST_F(LocationProviderTest, DefaultLocationProviderWithCustomDataLocation) {
+  std::ignore =
+      properties_.Set(TableProperties::kWriteDataLocation, std::string("new_location"));
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  auto location = provider->NewDataLocation("my_file");
+  EXPECT_EQ("new_location/my_file", location);
+}
+
+TEST_F(LocationProviderTest, ObjectStorageLocationProvider) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  auto location = provider->NewDataLocation("test.parquet");
+  std::string relative_location = location;
+  if (relative_location.starts_with(table_location_)) {
+    relative_location = relative_location.substr(table_location_.size());
+  }
+
+  std::vector<std::string> parts = SplitString(relative_location, '/');
+  ASSERT_EQ(7, parts.size());
+  EXPECT_EQ("", parts[0]);
+  EXPECT_EQ("data", parts[1]);
+  for (int i = 2; i <= 5; i++) {
+    EXPECT_FALSE(parts[i].empty());
+  }
+  EXPECT_EQ("test.parquet", parts[6]);
+}
+
+TEST_F(LocationProviderTest, ObjectStorageWithPartition) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto mock_spec,
+      PartitionSpec::Make(PartitionSpec::kInitialSpecId,
+                          {PartitionField(1, 1, "data#1", Transform::Identity())},
+                          PartitionSpec::kInvalidPartitionFieldId + 1));
+  PartitionValues mock_partition_data({Literal::String("val#1")});
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto location,
+      provider->NewDataLocation(*mock_spec, mock_partition_data, "test.parquet"));
+
+  std::vector<std::string> parts = SplitString(location, '/');
+  ASSERT_GT(parts.size(), 2);
+  EXPECT_EQ("data%231=%22val%231%22", parts[parts.size() - 2]);
+}
+
+TEST_F(LocationProviderTest, ObjectStorageExcludePartitionInPath) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true)
+                    .Set(TableProperties::kWriteObjectStorePartitionedPaths, false);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  auto location = provider->NewDataLocation("test.parquet");
+
+  EXPECT_THAT(location, testing::HasSubstr(table_location_));
+  EXPECT_THAT(location, testing::HasSubstr("/data/"));
+  EXPECT_THAT(location, testing::HasSubstr("-test.parquet"));
+}
+
+TEST_F(LocationProviderTest, HashInjection) {
+  std::ignore = properties_.Set(TableProperties::kObjectStoreEnabled, true);
+  ICEBERG_UNWRAP_OR_FAIL(auto provider,
+                         LocationProvider::Make(table_location_, properties_));
+
+  auto location_a = provider->NewDataLocation("a");
+  EXPECT_THAT(location_a, testing::EndsWith("/data/0101/0110/1001/10110010/a"));
+
+  auto location_b = provider->NewDataLocation("b");
+  EXPECT_THAT(location_b, testing::EndsWith("/data/1110/0111/1110/00000011/b"));
+
+  auto location_c = provider->NewDataLocation("c");
+  EXPECT_THAT(location_c, testing::EndsWith("/data/0010/1101/0110/01011111/c"));
+
+  auto location_d = provider->NewDataLocation("d");
+  EXPECT_THAT(location_d, testing::EndsWith("/data/1001/0001/0100/01110011/d"));
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index e15a7a9e..95c68962 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -46,6 +46,7 @@ iceberg_tests = {
     },
     'table_test': {
         'sources': files(
+            'location_provider_test.cc',
             'metrics_config_test.cc',
             'snapshot_test.cc',
             'snapshot_util_test.cc',
diff --git a/src/iceberg/util/location_util.h b/src/iceberg/util/location_util.h
index 547dd5ce..eb78dece 100644
--- a/src/iceberg/util/location_util.h
+++ b/src/iceberg/util/location_util.h
@@ -19,7 +19,7 @@
 
 #pragma once
 
-#include <string>
+#include <string_view>
 
 #include "iceberg/iceberg_export.h"
 
@@ -27,16 +27,15 @@ namespace iceberg {
 
 class ICEBERG_EXPORT LocationUtil {
  public:
-  static std::string StripTrailingSlash(const std::string& path) {
+  static std::string_view StripTrailingSlash(std::string_view path) {
     if (path.empty()) {
       return "";
     }
 
-    std::string_view result = path;
-    while (result.ends_with("/") && !result.ends_with("://")) {
-      result.remove_suffix(1);
+    while (path.ends_with("/") && !path.ends_with("://")) {
+      path.remove_suffix(1);
     }
-    return std::string(result);
+    return path;
   }
 };
 

Reply via email to