When creating LRU hash maps with BPF_F_NO_COMMON_LRU, the kernel
silently rounds max_entries up to a multiple of num_possible_cpus()
in htab_map_alloc() to ensure each per-CPU LRU list has at least one
element. However, the original value requested by the caller is lost --
map->max_entries is overwritten with the rounded value.
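
For example, assuming a machine with 6 possible CPUs (an arbitrary
count chosen for illustration), htab_map_alloc() effectively does:

    map->max_entries = roundup(1000, 6); /* -> 1002; the 1000 is gone */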

This creates a problem for userspace map managers (e.g., Cilium) that
reconcile map parameters against their configured values. When the
kernel-reported max_entries differs from the originally requested value,
the reconciliation logic detects a "mismatch" and may enter an infinite
delete-recreate loop. This has been seen in production incidents: any
CPU count that does not evenly divide the configured map size produces
a small but persistent rounding difference.

Add a new 'requested_max_entries' field to struct bpf_map (kernel-
internal) and struct bpf_map_info (UAPI) that preserves the caller's
original max_entries value. The field is set in bpf_map_init_from_attr()
before any map-type-specific adjustments, and exposed to userspace
via BPF_OBJ_GET_INFO_BY_FD.

This is a purely additive, backward-compatible change:
- Old callers that don't know about the new field never read it; the
  kernel-side info struct is zero-initialized (memset in
  bpf_map_get_info_by_fd), so no stale data leaks into the trailing
  bytes.
- New callers can compare requested_max_entries against max_entries to
  detect kernel adjustments and avoid false reconciliation mismatches
  (see the sketch below).
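
A minimal sketch of such a check in a userspace manager (hypothetical
helper name; uses libbpf's bpf_map_get_info_by_fd()):

    static bool map_size_matches(int map_fd, __u32 configured)
    {
            struct bpf_map_info info = {};
            __u32 len = sizeof(info);

            if (bpf_map_get_info_by_fd(map_fd, &info, &len))
                    return false;

            /* Compare against what was requested, not against the
             * value the kernel may have rounded it up to.
             */
            return info.requested_max_entries == configured;
    }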

The new field is placed between max_entries (u32) and map_extra (u64)
in struct bpf_map, filling the existing alignment padding hole -- no
increase in struct size.

Also update the BPF_F_NO_COMMON_LRU documentation to describe the
rounding behavior and the new field.

Selftests are included covering LRU hash maps with and without
BPF_F_NO_COMMON_LRU, LRU per-CPU hash maps with BPF_F_NO_COMMON_LRU,
and regular hash maps.
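
They can be run with the usual selftest runner, e.g.:

    ./test_progs -t map_requested_max_entries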

Signed-off-by: Anand Kumar Shaw <[email protected]>
---
 include/linux/bpf.h                           |   1 +
 include/uapi/linux/bpf.h                      |  12 ++
 kernel/bpf/syscall.c                          |   2 +
 tools/include/uapi/linux/bpf.h                |  12 ++
 .../prog_tests/map_requested_max_entries.c    | 134 ++++++++++++++++++
 5 files changed, 161 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/map_requested_max_entries.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd9b96434..8606b2c40 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -304,6 +304,7 @@ struct bpf_map {
        u32 key_size;
        u32 value_size;
        u32 max_entries;
+       u32 requested_max_entries; /* original max_entries before kernel adjustment */
        u64 map_extra; /* any per-map-type extra fields */
        u32 map_flags;
        u32 id;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8d400b76..39cd781c2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1405,6 +1405,12 @@ enum {
  * which can scale and perform better.
  * Note, the LRU nodes (including free nodes) cannot be moved
  * across different LRU lists.
+ *
+ * When this flag is set, the kernel rounds max_entries up to a multiple
+ * of num_possible_cpus() so that each per-CPU LRU list has at least one
+ * element. The actual (possibly adjusted) value is reported via
+ * bpf_map_info.max_entries, while the original requested value is
+ * preserved in bpf_map_info.requested_max_entries.
  */
        BPF_F_NO_COMMON_LRU     = (1U << 1),
 /* Specify numa node during map creation */
@@ -6717,6 +6723,12 @@ struct bpf_map_info {
        __u64 map_extra;
        __aligned_u64 hash;
        __u32 hash_size;
+       /* Original max_entries as requested by the caller. May differ from
+        * max_entries if the kernel adjusted it (e.g., rounded up to a
+        * multiple of num_possible_cpus() for per-CPU LRU hash maps when
+        * BPF_F_NO_COMMON_LRU is set).
+        */
+       __u32 requested_max_entries;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd89bf809..66a518f3a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -439,6 +439,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
        map->key_size = attr->key_size;
        map->value_size = attr->value_size;
        map->max_entries = attr->max_entries;
+       map->requested_max_entries = attr->max_entries;
        map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
        map->numa_node = bpf_map_attr_numa_node(attr);
        map->map_extra = attr->map_extra;
@@ -5301,6 +5302,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
        info.key_size = map->key_size;
        info.value_size = map->value_size;
        info.max_entries = map->max_entries;
+       info.requested_max_entries = map->requested_max_entries;
        info.map_flags = map->map_flags;
        info.map_extra = map->map_extra;
        memcpy(info.name, map->name, sizeof(map->name));
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5e38b4887..bea369e10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1405,6 +1405,12 @@ enum {
  * which can scale and perform better.
  * Note, the LRU nodes (including free nodes) cannot be moved
  * across different LRU lists.
+ *
+ * When this flag is set, the kernel rounds max_entries up to a multiple
+ * of num_possible_cpus() so that each per-CPU LRU list has at least one
+ * element. The actual (possibly adjusted) value is reported via
+ * bpf_map_info.max_entries, while the original requested value is
+ * preserved in bpf_map_info.requested_max_entries.
  */
        BPF_F_NO_COMMON_LRU     = (1U << 1),
 /* Specify numa node during map creation */
@@ -6717,6 +6723,12 @@ struct bpf_map_info {
        __u64 map_extra;
        __aligned_u64 hash;
        __u32 hash_size;
+       /* Original max_entries as requested by the caller. May differ from
+        * max_entries if the kernel adjusted it (e.g., rounded up to a
+        * multiple of num_possible_cpus() for per-CPU LRU hash maps when
+        * BPF_F_NO_COMMON_LRU is set).
+        */
+       __u32 requested_max_entries;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/tools/testing/selftests/bpf/prog_tests/map_requested_max_entries.c b/tools/testing/selftests/bpf/prog_tests/map_requested_max_entries.c
new file mode 100644
index 000000000..e54e88326
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_requested_max_entries.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test that bpf_map_info.requested_max_entries correctly reports the
+ * original max_entries value requested by the caller, even when the
+ * kernel adjusts max_entries internally (e.g., rounding up for per-CPU
+ * LRU hash maps with BPF_F_NO_COMMON_LRU).
+ */
+#include <test_progs.h>
+#include <bpf/bpf.h>
+
+static void test_lru_hash_no_common_lru(void)
+{
+       LIBBPF_OPTS(bpf_map_create_opts, opts);
+       struct bpf_map_info info = {};
+       __u32 info_len = sizeof(info);
+       /* Use a prime number to guarantee rounding on any SMP system */
+       __u32 requested = 997;
+       int map_fd, err;
+
+       opts.map_flags = BPF_F_NO_COMMON_LRU;
+
+       map_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "test_lru_pcpu",
+                               sizeof(__u32), sizeof(__u32),
+                               requested, &opts);
+       if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+               return;
+
+       err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "bpf_map_get_info_by_fd"))
+               goto out;
+
+       ASSERT_EQ(info.requested_max_entries, requested,
+                 "requested_max_entries");
+       ASSERT_GE(info.max_entries, requested,
+                 "max_entries >= requested");
+
+out:
+       close(map_fd);
+}
+
+static void test_lru_percpu_hash_no_common_lru(void)
+{
+       LIBBPF_OPTS(bpf_map_create_opts, opts);
+       struct bpf_map_info info = {};
+       __u32 info_len = sizeof(info);
+       __u32 requested = 997;
+       int map_fd, err;
+
+       opts.map_flags = BPF_F_NO_COMMON_LRU;
+
+       map_fd = bpf_map_create(BPF_MAP_TYPE_LRU_PERCPU_HASH,
+                               "test_lru_pcpu_v",
+                               sizeof(__u32), sizeof(__u32),
+                               requested, &opts);
+       if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+               return;
+
+       err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "bpf_map_get_info_by_fd"))
+               goto out;
+
+       ASSERT_EQ(info.requested_max_entries, requested,
+                 "requested_max_entries");
+       ASSERT_GE(info.max_entries, requested,
+                 "max_entries >= requested");
+
+out:
+       close(map_fd);
+}
+
+static void test_lru_hash_common_lru(void)
+{
+       struct bpf_map_info info = {};
+       __u32 info_len = sizeof(info);
+       __u32 requested = 997;
+       int map_fd, err;
+
+       /* Without BPF_F_NO_COMMON_LRU, max_entries should not be rounded */
+       map_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "test_lru_common",
+                               sizeof(__u32), sizeof(__u32),
+                               requested, NULL);
+       if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+               return;
+
+       err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "bpf_map_get_info_by_fd"))
+               goto out;
+
+       ASSERT_EQ(info.requested_max_entries, requested,
+                 "requested_max_entries");
+       ASSERT_EQ(info.max_entries, requested,
+                 "max_entries == requested (no rounding)");
+
+out:
+       close(map_fd);
+}
+
+static void test_hash_map(void)
+{
+       struct bpf_map_info info = {};
+       __u32 info_len = sizeof(info);
+       __u32 requested = 256;
+       int map_fd, err;
+
+       /* Regular hash map: max_entries should equal requested */
+       map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "test_hash",
+                               sizeof(__u32), sizeof(__u32),
+                               requested, NULL);
+       if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+               return;
+
+       err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "bpf_map_get_info_by_fd"))
+               goto out;
+
+       ASSERT_EQ(info.requested_max_entries, requested,
+                 "requested_max_entries");
+       ASSERT_EQ(info.max_entries, requested,
+                 "max_entries == requested");
+
+out:
+       close(map_fd);
+}
+
+void test_map_requested_max_entries(void)
+{
+       if (test__start_subtest("lru_hash_no_common_lru"))
+               test_lru_hash_no_common_lru();
+       if (test__start_subtest("lru_percpu_hash_no_common_lru"))
+               test_lru_percpu_hash_no_common_lru();
+       if (test__start_subtest("lru_hash_common_lru"))
+               test_lru_hash_common_lru();
+       if (test__start_subtest("hash_map"))
+               test_hash_map();
+}
-- 
2.39.5 (Apple Git-154)