On 7/22/15 1:09 AM, Kaixu Xia wrote:
The user space event FDs from perf_event_open() syscall are
converted to the pointer to struct perf_event and stored in map.
...
+static int replace_map_with_perf_event(void *value)
+{
+ struct perf_event *event;
+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ /* limit the event type to PERF_TYPE_RAW
+ * and PERF_TYPE_HARDWARE
+ */
+ if (event->attr.type != PERF_TYPE_RAW &&
+ event->attr.type != PERF_TYPE_HARDWARE)
+ return -EINVAL;
+
+ memcpy(value, &event, sizeof(struct perf_event *));
+
+ return 0;
+}
+
+static bool check_map_perf_event_stored(struct bpf_map *map, void *key)
+{
+ void *event;
+ bool is_stored = false;
+
+ rcu_read_lock();
+ event = array_map_lookup_elem(map, key);
+ if (event && (*(unsigned long *)event))
+ is_stored = true;
+ rcu_read_unlock();
+
+ return is_stored;
+}
+
+/* only called from syscall */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ /* check if the value is already stored */
+ if (check_map_perf_event_stored(map, key))
+ return -EINVAL;
+
+ if (replace_map_with_perf_event(value))
+ return -EBADF;
the above three functions are broken due to races.
update_elem can be called by different user space processes.
Please see how prog_array solves it via xchg()
+ if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+ if (!map->ops->map_traverse_elem)
+ return -EPERM;
+
+ rcu_read_lock();
+ if (map->ops->map_traverse_elem(bpf_map_perf_event_put, map) <
0) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rcu_read_unlock();
as I mentioned as part of patch 2 the above seems unnecessary.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..14a9924 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,32 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = f.file->private_data;
+
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
I don't understand necessity of '_not_zero' suffix. why?
How it can possibly race to zero here if we have an fd?
+ fdput(f);
+ return ERR_PTR(-ENOENT);
+ }
+
+ fdput(f);
+ return event;
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/