This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 01b454c  feat: adopt MurmurHash3 (#78)
01b454c is described below

commit 01b454c1905b24664541c30cb31f1ac2a430926d
Author: Junwang Zhao <[email protected]>
AuthorDate: Thu Apr 17 01:08:22 2025 +0800

    feat: adopt MurmurHash3 (#78)
    
    Which will be used by BucketTransform.
---
 .github/.licenserc.yaml                  |   1 +
 LICENSE                                  |  13 +
 NOTICE                                   |   5 +
 src/iceberg/CMakeLists.txt               |   7 +-
 src/iceberg/util/murmurhash3_internal.cc | 414 +++++++++++++++++++++++++++++++
 src/iceberg/util/murmurhash3_internal.h  |  38 +++
 6 files changed, 475 insertions(+), 3 deletions(-)

diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index c657a03..8b20554 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -12,5 +12,6 @@ header:
     - 'LICENSE'
     - 'NOTICE'
     - 'src/iceberg/expected.h'
+    - 'src/iceberg/util/murmurhash3_internal.*'
 
   comment: on-failure
diff --git a/LICENSE b/LICENSE
index c23c810..ce4ab84 100644
--- a/LICENSE
+++ b/LICENSE
@@ -254,3 +254,16 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The file src/iceberg/murmur3_internal.h contains code adapted from
+
+https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.h
+
+The file src/iceberg/murmur3_internal.cc contains code adapted from
+
+https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+
+MurmurHash3 was written by Austin Appleby, and is placed in the public
+domain. The author disclaims copyright to this source code.
diff --git a/NOTICE b/NOTICE
index c71e81d..cad0c1a 100644
--- a/NOTICE
+++ b/NOTICE
@@ -7,3 +7,8 @@ The Apache Software Foundation (http://www.apache.org/).
 This product includes code from zeus-cpp
  * Copyright (c) 2024 zeus-cpp
  * https://github.com/zeus-cpp/expected
+
+This product includes code from smhasher
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ * https://github.com/aappleby/smhasher
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 1a8c334..65a4f2e 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -21,11 +21,12 @@ set(ICEBERG_SOURCES
     arrow_c_data_internal.cc
     demo.cc
     json_internal.cc
+    partition_field.cc
+    partition_spec.cc
     schema.cc
     schema_field.cc
     schema_internal.cc
-    partition_field.cc
-    partition_spec.cc
+    snapshot.cc
     sort_field.cc
     sort_order.cc
     statistics_file.cc
@@ -33,7 +34,7 @@ set(ICEBERG_SOURCES
     transform.cc
     transform_function.cc
     type.cc
-    snapshot.cc)
+    util/murmurhash3_internal.cc)
 
 set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
 set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/util/murmurhash3_internal.cc 
b/src/iceberg/util/murmurhash3_internal.cc
new file mode 100644
index 0000000..ce1013e
--- /dev/null
+++ b/src/iceberg/util/murmurhash3_internal.cc
@@ -0,0 +1,414 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "iceberg/util/murmurhash3_internal.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#  define FORCE_INLINE __forceinline
+
+#  include <cstdlib>
+
+#  define ROTL32(x, y) _rotl(x, y)
+#  define ROTL64(x, y) _rotl64(x, y)
+
+#  define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else  // defined(_MSC_VER)
+
+#  define FORCE_INLINE inline __attribute__((always_inline))
+
+inline uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - 
r)); }
+
+inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - 
r)); }
+
+#  define ROTL32(x, y) rotl32(x, y)
+#  define ROTL64(x, y) rotl64(x, y)
+
+#  define BIG_CONSTANT(x) (x##LLU)
+
+#endif  // !defined(_MSC_VER)
+
+// NOLINTBEGIN
+
+namespace iceberg {
+
+namespace {
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
+
+FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32(uint32_t h) {
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64(uint64_t k) {
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k ^= k >> 33;
+
+  return k;
+}
+
+}  // namespace
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out) {
+  const uint8_t* data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4);
+
+  for (int i = -nblocks; i; i++) {
+    uint32_t k1 = getblock32(blocks, i);
+
+    k1 *= c1;
+    k1 = ROTL32(k1, 15);
+    k1 *= c2;
+
+    h1 ^= k1;
+    h1 = ROTL32(h1, 13);
+    h1 = h1 * 5 + 0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
+
+  uint32_t k1 = 0;
+
+  switch (len & 3) {
+    case 3:
+      k1 ^= tail[2] << 16;
+    case 2:
+      k1 ^= tail[1] << 8;
+    case 1:
+      k1 ^= tail[0];
+      k1 *= c1;
+      k1 = ROTL32(k1, 15);
+      k1 *= c2;
+      h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  *(uint32_t*)out = h1;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_128(const void* key, const int len, uint32_t seed, void* 
out) {
+  const uint8_t* data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b;
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5;
+  const uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body
+
+  const uint32_t* blocks = (const uint32_t*)(data + nblocks * 16);
+
+  for (int i = -nblocks; i; i++) {
+    uint32_t k1 = getblock32(blocks, i * 4 + 0);
+    uint32_t k2 = getblock32(blocks, i * 4 + 1);
+    uint32_t k3 = getblock32(blocks, i * 4 + 2);
+    uint32_t k4 = getblock32(blocks, i * 4 + 3);
+
+    k1 *= c1;
+    k1 = ROTL32(k1, 15);
+    k1 *= c2;
+    h1 ^= k1;
+
+    h1 = ROTL32(h1, 19);
+    h1 += h2;
+    h1 = h1 * 5 + 0x561ccd1b;
+
+    k2 *= c2;
+    k2 = ROTL32(k2, 16);
+    k2 *= c3;
+    h2 ^= k2;
+
+    h2 = ROTL32(h2, 17);
+    h2 += h3;
+    h2 = h2 * 5 + 0x0bcaa747;
+
+    k3 *= c3;
+    k3 = ROTL32(k3, 17);
+    k3 *= c4;
+    h3 ^= k3;
+
+    h3 = ROTL32(h3, 15);
+    h3 += h4;
+    h3 = h3 * 5 + 0x96cd1c35;
+
+    k4 *= c4;
+    k4 = ROTL32(k4, 18);
+    k4 *= c1;
+    h4 ^= k4;
+
+    h4 = ROTL32(h4, 13);
+    h4 += h1;
+    h4 = h4 * 5 + 0x32ac3b17;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  switch (len & 15) {
+    case 15:
+      k4 ^= tail[14] << 16;
+    case 14:
+      k4 ^= tail[13] << 8;
+    case 13:
+      k4 ^= tail[12] << 0;
+      k4 *= c4;
+      k4 = ROTL32(k4, 18);
+      k4 *= c1;
+      h4 ^= k4;
+
+    case 12:
+      k3 ^= tail[11] << 24;
+    case 11:
+      k3 ^= tail[10] << 16;
+    case 10:
+      k3 ^= tail[9] << 8;
+    case 9:
+      k3 ^= tail[8] << 0;
+      k3 *= c3;
+      k3 = ROTL32(k3, 17);
+      k3 *= c4;
+      h3 ^= k3;
+
+    case 8:
+      k2 ^= tail[7] << 24;
+    case 7:
+      k2 ^= tail[6] << 16;
+    case 6:
+      k2 ^= tail[5] << 8;
+    case 5:
+      k2 ^= tail[4] << 0;
+      k2 *= c2;
+      k2 = ROTL32(k2, 16);
+      k2 *= c3;
+      h2 ^= k2;
+
+    case 4:
+      k1 ^= tail[3] << 24;
+    case 3:
+      k1 ^= tail[2] << 16;
+    case 2:
+      k1 ^= tail[1] << 8;
+    case 1:
+      k1 ^= tail[0] << 0;
+      k1 *= c1;
+      k1 = ROTL32(k1, 15);
+      k1 *= c2;
+      h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+  h2 ^= len;
+  h3 ^= len;
+  h4 ^= len;
+
+  h1 += h2;
+  h1 += h3;
+  h1 += h4;
+  h2 += h1;
+  h3 += h1;
+  h4 += h1;
+
+  h1 = fmix32(h1);
+  h2 = fmix32(h2);
+  h3 = fmix32(h3);
+  h4 = fmix32(h4);
+
+  h1 += h2;
+  h1 += h3;
+  h1 += h4;
+  h2 += h1;
+  h3 += h1;
+  h4 += h1;
+
+  ((uint32_t*)out)[0] = h1;
+  ((uint32_t*)out)[1] = h2;
+  ((uint32_t*)out)[2] = h3;
+  ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x64_128(const void* key, const int len, const uint32_t seed, 
void* out) {
+  const uint8_t* data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  //----------
+  // body
+
+  const uint64_t* blocks = (const uint64_t*)(data);
+
+  for (int i = 0; i < nblocks; i++) {
+    uint64_t k1 = getblock64(blocks, i * 2 + 0);
+    uint64_t k2 = getblock64(blocks, i * 2 + 1);
+
+    k1 *= c1;
+    k1 = ROTL64(k1, 31);
+    k1 *= c2;
+    h1 ^= k1;
+
+    h1 = ROTL64(h1, 27);
+    h1 += h2;
+    h1 = h1 * 5 + 0x52dce729;
+
+    k2 *= c2;
+    k2 = ROTL64(k2, 33);
+    k2 *= c1;
+    h2 ^= k2;
+
+    h2 = ROTL64(h2, 31);
+    h2 += h1;
+    h2 = h2 * 5 + 0x38495ab5;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  switch (len & 15) {
+    case 15:
+      k2 ^= ((uint64_t)tail[14]) << 48;
+    case 14:
+      k2 ^= ((uint64_t)tail[13]) << 40;
+    case 13:
+      k2 ^= ((uint64_t)tail[12]) << 32;
+    case 12:
+      k2 ^= ((uint64_t)tail[11]) << 24;
+    case 11:
+      k2 ^= ((uint64_t)tail[10]) << 16;
+    case 10:
+      k2 ^= ((uint64_t)tail[9]) << 8;
+    case 9:
+      k2 ^= ((uint64_t)tail[8]) << 0;
+      k2 *= c2;
+      k2 = ROTL64(k2, 33);
+      k2 *= c1;
+      h2 ^= k2;
+
+    case 8:
+      k1 ^= ((uint64_t)tail[7]) << 56;
+    case 7:
+      k1 ^= ((uint64_t)tail[6]) << 48;
+    case 6:
+      k1 ^= ((uint64_t)tail[5]) << 40;
+    case 5:
+      k1 ^= ((uint64_t)tail[4]) << 32;
+    case 4:
+      k1 ^= ((uint64_t)tail[3]) << 24;
+    case 3:
+      k1 ^= ((uint64_t)tail[2]) << 16;
+    case 2:
+      k1 ^= ((uint64_t)tail[1]) << 8;
+    case 1:
+      k1 ^= ((uint64_t)tail[0]) << 0;
+      k1 *= c1;
+      k1 = ROTL64(k1, 31);
+      k1 *= c2;
+      h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+  h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t*)out)[0] = h1;
+  ((uint64_t*)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+}  // namespace iceberg
+
+// NOLINTEND
diff --git a/src/iceberg/util/murmurhash3_internal.h 
b/src/iceberg/util/murmurhash3_internal.h
new file mode 100644
index 0000000..5b71f93
--- /dev/null
+++ b/src/iceberg/util/murmurhash3_internal.h
@@ -0,0 +1,38 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#pragma once
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else  // defined(_MSC_VER)
+
+#  include <cstdint>
+
+#endif  // !defined(_MSC_VER)
+
+namespace iceberg {
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out);
+
+void MurmurHash3_x86_128(const void* key, int len, uint32_t seed, void* out);
+
+void MurmurHash3_x64_128(const void* key, int len, uint32_t seed, void* out);
+
+//-----------------------------------------------------------------------------
+
+}  // namespace iceberg

Reply via email to