On 10/08/2018 08:13 PM, Song Liu wrote:
On Mon, Oct 8, 2018 at 12:12 PM Mauricio Vasquez B
<mauricio.vasq...@polito.it> wrote:
The following patch implements bpf queue/stack maps that
provide the peek/pop/push functions.  There is no direct
relationship between those functions and the existing map
syscalls, hence a new BPF_MAP_LOOKUP_AND_DELETE_ELEM syscall is
added.  It is mapped to the pop operation in the queue/stack
maps and is still to be implemented for other kinds of maps.
Do we need this system call for other maps (non-stack/queue)?
If not, maybe we can just call it POP, and only implement it for
stack and queue?

Yes, this system call could also benefit other maps.  The first idea was to add pop/push/peek system calls as well, but then Alexei realized that was too specific to queue/stack maps, so we decided to go ahead with this solution, which is more general.
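
For illustration, popping an element from userspace would then go through the raw bpf(2) syscall roughly as in the sketch below.  The wrapper itself is hypothetical (only the command number and the map_fd/value fields of union bpf_attr come from this patch), and it assumes queue/stack maps take no key (key_size == 0):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Hypothetical helper, not part of the patch: pop one element from a
 * queue/stack map.  attr must be zeroed beyond 'value', the last field
 * checked by CHECK_ATTR for this command; no key is passed, assuming
 * the map was created with key_size == 0.
 */
static int bpf_map_pop(int map_fd, void *value)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.value = (__u64)(unsigned long)value;

        return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
                       &attr, sizeof(attr));
}

On success the popped value lands in the caller's buffer; -1 with errno set to ENOENT signals an empty map, matching the "err = ptr ? 0 : -ENOENT" in the kernel path below.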

Signed-off-by: Mauricio Vasquez B <mauricio.vasq...@polito.it>
---
  include/linux/bpf.h      |    1 +
  include/uapi/linux/bpf.h |    1 +
  kernel/bpf/syscall.c     |   81 ++++++++++++++++++++++++++++++++++++++++++++++
  3 files changed, 83 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..98c7eeb6d138 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map_ops {
         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
         int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
         int (*map_delete_elem)(struct bpf_map *map, void *key);
+       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);

         /* funcs called by prog_array and perf_event_array map */
         void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..3bb94aa2d408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
         BPF_BTF_LOAD,
         BPF_BTF_GET_FD_BY_ID,
         BPF_TASK_FD_QUERY,
+       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
  };

  enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb75e8af73ff..c33d9303f72f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
         return err;
  }

+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+       void __user *ukey = u64_to_user_ptr(attr->key);
+       void __user *uvalue = u64_to_user_ptr(attr->value);
+       int ufd = attr->map_fd;
+       struct bpf_map *map;
+       void *key, *value, *ptr;
+       u32 value_size;
+       struct fd f;
+       int err;
+
+       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+               return -EINVAL;
+
+       f = fdget(ufd);
+       map = __bpf_map_get(f);
+       if (IS_ERR(map))
+               return PTR_ERR(map);
+
+       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
+       key = __bpf_copy_key(ukey, map->key_size);
+       if (IS_ERR(key)) {
+               err = PTR_ERR(key);
+               goto err_put;
+       }
+
+       value_size = map->value_size;
+
+       err = -ENOMEM;
+       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       if (!value)
+               goto free_key;
+
+       err = -EFAULT;
+       if (copy_from_user(value, uvalue, value_size) != 0)
+               goto free_value;
+
+       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+        * inside bpf map update or delete otherwise deadlocks are possible
+        */
+       preempt_disable();
+       __this_cpu_inc(bpf_prog_active);
+       if (!map->ops->map_lookup_and_delete_elem) {
Why do we have this check here? Shouldn't it be checked much earlier?

In this particular patch the check could be done much earlier, but the next patch needs it at this position, so I left it here to avoid moving the code around in the next patch.

If we really need it here, we need to at least add the following:

        __this_cpu_dec(bpf_prog_active);
        preempt_enable();

You are right, I missed that. Will fix in the next version.


+               err = -ENOTSUPP;
+               goto free_value;
+       }
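
With that cleanup folded in, the branch above would presumably read like this in the next revision (a sketch of the agreed fix, not code from this patch):

        if (!map->ops->map_lookup_and_delete_elem) {
                err = -ENOTSUPP;
                /* undo the increment/disable done just before the check */
                __this_cpu_dec(bpf_prog_active);
                preempt_enable();
                goto free_value;
        }
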
+       rcu_read_lock();
+       ptr = map->ops->map_lookup_and_delete_elem(map, key);
+       if (ptr)
+               memcpy(value, ptr, value_size);
+       rcu_read_unlock();
+       err = ptr ? 0 : -ENOENT;
+       __this_cpu_dec(bpf_prog_active);
+       preempt_enable();
+
+       if (err)
+               goto free_value;
+
+       if (copy_to_user(uvalue, value, value_size) != 0)
+               goto free_value;
+
+       err = 0;
+
+free_value:
+       kfree(value);
+free_key:
+       kfree(key);
+err_put:
+       fdput(f);
+       return err;
+}
+
  static const struct bpf_prog_ops * const bpf_prog_types[] = {
  #define BPF_PROG_TYPE(_id, _name) \
         [_id] = & _name ## _prog_ops,
@@ -2448,6 +2526,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
         case BPF_TASK_FD_QUERY:
                 err = bpf_task_fd_query(&attr, uattr);
                 break;
+       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+               err = map_lookup_and_delete_elem(&attr);
+               break;
         default:
                 err = -EINVAL;
                 break;
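
To make the new ops hook concrete: a map type wires pop semantics in by implementing map_lookup_and_delete_elem and returning a pointer to the removed element's value, or NULL when the map is empty (which the syscall path above turns into -ENOENT).  The sketch below shows a hypothetical FIFO-style callback; struct example_queue and all of its fields are invented for illustration, this is not the implementation from the follow-up queue/stack patch, and the lifetime of the returned pointer versus concurrent pushes is glossed over:

/* Hypothetical backing structure for a FIFO map. */
struct example_queue {
        struct bpf_map map;
        raw_spinlock_t lock;
        u32 head;               /* index of the oldest element */
        u32 count;              /* elements currently stored */
        u32 size;               /* ring capacity */
        char elements[];        /* size * map.value_size bytes */
};

static void *example_queue_lookup_and_delete_elem(struct bpf_map *map, void *key)
{
        struct example_queue *q = container_of(map, struct example_queue, map);
        unsigned long flags;
        void *ptr = NULL;

        raw_spin_lock_irqsave(&q->lock, flags);
        if (q->count) {
                ptr = &q->elements[q->head * map->value_size];
                q->head = (q->head + 1) % q->size;
                q->count--;
        }
        raw_spin_unlock_irqrestore(&q->lock, flags);

        /* NULL -> the syscall path returns -ENOENT to userspace */
        return ptr;
}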

