Re: [PATCH bpf-next 3/6] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

2018-10-09 Thread Mauricio Vasquez




On 10/08/2018 08:13 PM, Song Liu wrote:

On Mon, Oct 8, 2018 at 12:12 PM Mauricio Vasquez B wrote:

The following patch implements bpf queue/stack maps that
provide the peek/pop/push functions.  There is no direct
relationship between those functions and the current map
syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added;
it is mapped to the pop operation in the queue/stack maps
and is still to be implemented for other kinds of maps.

Do we need this system call for other maps (non-stack/queue)?
If not, maybe we can just call it POP, and only implement it for
stack and queue?

Yes, this system call could also benefit other maps.  The first idea 
was to add pop/push/peek system calls as well, but then Alexei realized 
that was too specific to queue/stack maps, so we decided to go ahead 
with this more general solution.
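
For illustration, a minimal userspace sketch of driving the new command
directly through the bpf(2) syscall; the wrapper name is made up (it is
not a libbpf API in this series) and it assumes uapi headers that
already carry the new enum value:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* illustrative wrapper only, not part of the posted patches */
static int bpf_map_lookup_and_delete(int map_fd, const void *key, void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)value;

	/* on success the removed element's value is written to 'value' */
	return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
		       &attr, sizeof(attr));
}

Note that, as posted, the kernel side copy_from_user()s the value buffer
before calling the map op, so 'value' must point to a readable buffer of
at least value_size bytes.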



Signed-off-by: Mauricio Vasquez B 
---
  include/linux/bpf.h  |1 +
  include/uapi/linux/bpf.h |1 +
  kernel/bpf/syscall.c |   81 ++
  3 files changed, 83 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..98c7eeb6d138 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map_ops {
 void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 int (*map_update_elem)(struct bpf_map *map, void *key, void *value, 
u64 flags);
 int (*map_delete_elem)(struct bpf_map *map, void *key);
+   void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);

 /* funcs called by prog_array and perf_event_array map */
 void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..3bb94aa2d408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
 BPF_BTF_LOAD,
 BPF_BTF_GET_FD_BY_ID,
 BPF_TASK_FD_QUERY,
+   BPF_MAP_LOOKUP_AND_DELETE_ELEM,
  };

  enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb75e8af73ff..c33d9303f72f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
 return err;
  }

+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+   void __user *ukey = u64_to_user_ptr(attr->key);
+   void __user *uvalue = u64_to_user_ptr(attr->value);
+   int ufd = attr->map_fd;
+   struct bpf_map *map;
+   void *key, *value, *ptr;
+   u32 value_size;
+   struct fd f;
+   int err;
+
+   if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+   return -EINVAL;
+
+   f = fdget(ufd);
+   map = __bpf_map_get(f);
+   if (IS_ERR(map))
+   return PTR_ERR(map);
+
+   if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+   err = -EPERM;
+   goto err_put;
+   }
+
+   key = __bpf_copy_key(ukey, map->key_size);
+   if (IS_ERR(key)) {
+   err = PTR_ERR(key);
+   goto err_put;
+   }
+
+   value_size = map->value_size;
+
+   err = -ENOMEM;
+   value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+   if (!value)
+   goto free_key;
+
+   err = -EFAULT;
+   if (copy_from_user(value, uvalue, value_size) != 0)
+   goto free_value;
+
+   /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+* inside bpf map update or delete otherwise deadlocks are possible
+*/
+   preempt_disable();
+   __this_cpu_inc(bpf_prog_active);
+   if (!map->ops->map_lookup_and_delete_elem) {

Why do we have this check here? Shouldn't it be checked much earlier?
If we really need it here, we need to at least add the following:

__this_cpu_dec(bpf_prog_active);
preempt_enable();

In this particular patch the check could indeed be done much earlier, 
but the next patch needs it at this position, so I left it here to 
avoid moving it around in the next patch.

Regarding the missing __this_cpu_dec()/preempt_enable(): you are right, 
I missed that. Will fix it in the next version.
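
For reference, a minimal sketch of what that bail-out path could look
like once the cleanup is added (illustrative only, not the actual next
revision of the patch):

	if (!map->ops->map_lookup_and_delete_elem) {
		err = -ENOTSUPP;
		/* undo the recursion/preemption guards taken just above */
		__this_cpu_dec(bpf_prog_active);
		preempt_enable();
		goto free_value;
	}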




+   err = -ENOTSUPP;
+   goto free_value;
+   }
+   rcu_read_lock();
+   ptr = map->ops->map_lookup_and_delete_elem(map, key);
+   if (ptr)
+   memcpy(value, ptr, value_size);
+   rcu_read_unlock();
+   err = ptr ? 0 : -ENOENT;
+   __this_cpu_dec(bpf_prog_active);
+   preempt_enable();
+
+   if (err)
+   goto free_value;
+
+   if (copy_to_user(uvalue, value, value_size) != 0)
+   goto free_value;
+
+   err = 0;
+
+free_value:
+   kfree(value);
+free_key:
+   kfree(key);
+err_put:
+   fdput(f);
+   return err;
+}
+
  static const struct bpf_prog_ops * const bpf_prog_types[] = {
  #define BPF_PROG_TYPE(_id, _name) \
 [_id] = & _name ## _prog_ops,

Re: [PATCH bpf-next 3/6] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

2018-10-08 Thread Song Liu
On Mon, Oct 8, 2018 at 12:12 PM Mauricio Vasquez B wrote:
>
> The following patch implements bpf queue/stack maps that
> provide the peek/pop/push functions.  There is no direct
> relationship between those functions and the current map
> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added;
> it is mapped to the pop operation in the queue/stack maps
> and is still to be implemented for other kinds of maps.

Do we need this system call for other maps (non-stack/queue)?
If not, maybe we can just call it POP, and only implement it for
stack and queue?

>
> Signed-off-by: Mauricio Vasquez B 
> ---
>  include/linux/bpf.h  |1 +
>  include/uapi/linux/bpf.h |1 +
>  kernel/bpf/syscall.c |   81 
> ++
>  3 files changed, 83 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 027697b6a22f..98c7eeb6d138 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -39,6 +39,7 @@ struct bpf_map_ops {
> void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> int (*map_update_elem)(struct bpf_map *map, void *key, void *value, 
> u64 flags);
> int (*map_delete_elem)(struct bpf_map *map, void *key);
> +   void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
>
> /* funcs called by prog_array and perf_event_array map */
> void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f9187b41dff6..3bb94aa2d408 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -103,6 +103,7 @@ enum bpf_cmd {
> BPF_BTF_LOAD,
> BPF_BTF_GET_FD_BY_ID,
> BPF_TASK_FD_QUERY,
> +   BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>  };
>
>  enum bpf_map_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index eb75e8af73ff..c33d9303f72f 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
> return err;
>  }
>
> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> +
> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
> +{
> +   void __user *ukey = u64_to_user_ptr(attr->key);
> +   void __user *uvalue = u64_to_user_ptr(attr->value);
> +   int ufd = attr->map_fd;
> +   struct bpf_map *map;
> +   void *key, *value, *ptr;
> +   u32 value_size;
> +   struct fd f;
> +   int err;
> +
> +   if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
> +   return -EINVAL;
> +
> +   f = fdget(ufd);
> +   map = __bpf_map_get(f);
> +   if (IS_ERR(map))
> +   return PTR_ERR(map);
> +
> +   if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
> +   err = -EPERM;
> +   goto err_put;
> +   }
> +
> +   key = __bpf_copy_key(ukey, map->key_size);
> +   if (IS_ERR(key)) {
> +   err = PTR_ERR(key);
> +   goto err_put;
> +   }
> +
> +   value_size = map->value_size;
> +
> +   err = -ENOMEM;
> +   value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> +   if (!value)
> +   goto free_key;
> +
> +   err = -EFAULT;
> +   if (copy_from_user(value, uvalue, value_size) != 0)
> +   goto free_value;
> +
> +   /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
> +* inside bpf map update or delete otherwise deadlocks are possible
> +*/
> +   preempt_disable();
> +   __this_cpu_inc(bpf_prog_active);
> +   if (!map->ops->map_lookup_and_delete_elem) {
Why do we have this check here? Shouldn't it be checked much earlier?
If we really need it here, we need to at least add the following:

__this_cpu_dec(bpf_prog_active);
preempt_enable();


> +   err = -ENOTSUPP;
> +   goto free_value;
> +   }
> +   rcu_read_lock();
> +   ptr = map->ops->map_lookup_and_delete_elem(map, key);
> +   if (ptr)
> +   memcpy(value, ptr, value_size);
> +   rcu_read_unlock();
> +   err = ptr ? 0 : -ENOENT;
> +   __this_cpu_dec(bpf_prog_active);
> +   preempt_enable();
> +
> +   if (err)
> +   goto free_value;
> +
> +   if (copy_to_user(uvalue, value, value_size) != 0)
> +   goto free_value;
> +
> +   err = 0;
> +
> +free_value:
> +   kfree(value);
> +free_key:
> +   kfree(key);
> +err_put:
> +   fdput(f);
> +   return err;
> +}
> +
>  static const struct bpf_prog_ops * const bpf_prog_types[] = {
>  #define BPF_PROG_TYPE(_id, _name) \
> [_id] = & _name ## _prog_ops,
> @@ -2448,6 +2526,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
> uattr, unsigned int, siz
> case BPF_TASK_FD_QUERY:
> err = bpf_task_fd_query(&attr, uattr);
> break;
> +   case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> +   err = map_lookup_and_delete_elem(&attr);

[PATCH bpf-next 3/6] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

2018-10-08 Thread Mauricio Vasquez B
The following patch implements bpf queue/stack maps that
provide the peek/pop/push functions.  There is no direct
relationship between those functions and the current map
syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added;
it is mapped to the pop operation in the queue/stack maps
and is still to be implemented for other kinds of maps.

Signed-off-by: Mauricio Vasquez B 
---
 include/linux/bpf.h  |1 +
 include/uapi/linux/bpf.h |1 +
 kernel/bpf/syscall.c |   81 ++
 3 files changed, 83 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..98c7eeb6d138 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 
flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+   void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
 
/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..3bb94aa2d408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
BPF_TASK_FD_QUERY,
+   BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb75e8af73ff..c33d9303f72f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -975,6 +975,84 @@ static int map_get_next_key(union bpf_attr *attr)
return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+   void __user *ukey = u64_to_user_ptr(attr->key);
+   void __user *uvalue = u64_to_user_ptr(attr->value);
+   int ufd = attr->map_fd;
+   struct bpf_map *map;
+   void *key, *value, *ptr;
+   u32 value_size;
+   struct fd f;
+   int err;
+
+   if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+   return -EINVAL;
+
+   f = fdget(ufd);
+   map = __bpf_map_get(f);
+   if (IS_ERR(map))
+   return PTR_ERR(map);
+
+   if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+   err = -EPERM;
+   goto err_put;
+   }
+
+   key = __bpf_copy_key(ukey, map->key_size);
+   if (IS_ERR(key)) {
+   err = PTR_ERR(key);
+   goto err_put;
+   }
+
+   value_size = map->value_size;
+
+   err = -ENOMEM;
+   value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+   if (!value)
+   goto free_key;
+
+   err = -EFAULT;
+   if (copy_from_user(value, uvalue, value_size) != 0)
+   goto free_value;
+
+   /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+* inside bpf map update or delete otherwise deadlocks are possible
+*/
+   preempt_disable();
+   __this_cpu_inc(bpf_prog_active);
+   if (!map->ops->map_lookup_and_delete_elem) {
+   err = -ENOTSUPP;
+   goto free_value;
+   }
+   rcu_read_lock();
+   ptr = map->ops->map_lookup_and_delete_elem(map, key);
+   if (ptr)
+   memcpy(value, ptr, value_size);
+   rcu_read_unlock();
+   err = ptr ? 0 : -ENOENT;
+   __this_cpu_dec(bpf_prog_active);
+   preempt_enable();
+
+   if (err)
+   goto free_value;
+
+   if (copy_to_user(uvalue, value, value_size) != 0)
+   goto free_value;
+
+   err = 0;
+
+free_value:
+   kfree(value);
+free_key:
+   kfree(key);
+err_put:
+   fdput(f);
+   return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
[_id] = & _name ## _prog_ops,
@@ -2448,6 +2526,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
case BPF_TASK_FD_QUERY:
err = bpf_task_fd_query(&attr, uattr);
break;
+   case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+   err = map_lookup_and_delete_elem(&attr);
+   break;
default:
err = -EINVAL;
break;
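
For illustration, a hypothetical drain loop built on top of a wrapper
like the bpf_map_lookup_and_delete() sketched earlier in the thread;
map_fd, key_buf and value_buf are placeholders for a map and buffers
set up by the caller:

#include <errno.h>
#include <stdio.h>

/* pop elements until the kernel reports ENOENT (nothing left) */
static void drain_map(int map_fd, void *key_buf, void *value_buf)
{
	for (;;) {
		if (bpf_map_lookup_and_delete(map_fd, key_buf, value_buf) == 0)
			continue;	/* one element delivered in value_buf */
		if (errno == ENOENT)	/* no more elements to pop */
			break;
		/* anything else (EPERM, EFAULT, the kernel-internal ENOTSUPP
		 * surfacing as errno 524, ...) is unexpected here
		 */
		perror("BPF_MAP_LOOKUP_AND_DELETE_ELEM");
		break;
	}
}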