Introduce the smp_load_acquire() and smp_store_release() macros. Fix
synchronization in io_uring_cq_advance() and __io_uring_get_cqe().
Remove a superfluous local variable, if-test and write barrier from
__io_uring_submit(). Remove a superfluous barrier from
test/io_uring_enter.c.
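
For reference only, and not part of this patch: the pairing that
smp_load_acquire() and smp_store_release() implement is the classic
single-producer/single-consumer ring, which is also how the SQ and CQ
rings are updated. The sketch below expresses the same ordering with
C11 atomics rather than the new macros; all names in it are
illustrative and do not refer to liburing structures.

#include <stdatomic.h>
#include <stdio.h>

#define RING_SIZE 8

static unsigned int ring[RING_SIZE];
static _Atomic unsigned int tail;	/* written by the producer */
static _Atomic unsigned int head;	/* written by the consumer */

static void produce(unsigned int value)
{
	unsigned int t = atomic_load_explicit(&tail, memory_order_relaxed);

	ring[t & (RING_SIZE - 1)] = value;
	/* Like smp_store_release(): the entry store is visible before the new tail. */
	atomic_store_explicit(&tail, t + 1, memory_order_release);
}

static int consume(unsigned int *value)
{
	unsigned int h = atomic_load_explicit(&head, memory_order_relaxed);

	/* Like smp_load_acquire(): the entry load happens after the tail read. */
	if (h == atomic_load_explicit(&tail, memory_order_acquire))
		return 0;
	*value = ring[h & (RING_SIZE - 1)];
	/* Like smp_store_release(): the entry read completes before the new head. */
	atomic_store_explicit(&head, h + 1, memory_order_release);
	return 1;
}

int main(void)
{
	unsigned int v;

	produce(42);
	if (consume(&v))
		printf("consumed %u\n", v);
	return 0;
}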

Cc: Stefan Hajnoczi <[email protected]>
Cc: Roman Penyaev <[email protected]>
Signed-off-by: Bart Van Assche <[email protected]>
---
 man/io_uring_setup.2  |  6 ++-
 src/barrier.h         | 87 +++++++++++++++++++++++++++++++++++++++++--
 src/liburing.h        | 15 +++-----
 src/queue.c           | 30 ++++-----------
 test/io_uring_enter.c |  8 ++--
 5 files changed, 107 insertions(+), 39 deletions(-)

diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2
index ebaee2d43f35..9ab01434c18d 100644
--- a/man/io_uring_setup.2
+++ b/man/io_uring_setup.2
@@ -97,7 +97,11 @@ call with the following code sequence:
 
 .in +4n
 .EX
-read_barrier();
+/*
+ * Ensure that the wakeup flag is read after the tail pointer has been
+ * written.
+ */
+smp_mb();
 if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
 .EE
diff --git a/src/barrier.h b/src/barrier.h
index ef00f6722ba9..e1a407fccde2 100644
--- a/src/barrier.h
+++ b/src/barrier.h
@@ -1,16 +1,95 @@
 #ifndef LIBURING_BARRIER_H
 #define LIBURING_BARRIER_H
 
+/*
+From the kernel documentation file refcount-vs-atomic.rst:
+
+A RELEASE memory ordering guarantees that all prior loads and
+stores (all po-earlier instructions) on the same CPU are completed
+before the operation. It also guarantees that all po-earlier
+stores on the same CPU and all propagated stores from other CPUs
+must propagate to all other CPUs before the release operation
+(A-cumulative property). This is implemented using
+:c:func:`smp_store_release`.
+
+An ACQUIRE memory ordering guarantees that all post loads and
+stores (all po-later instructions) on the same CPU are
+completed after the acquire operation. It also guarantees that all
+po-later stores on the same CPU must propagate to all other CPUs
+after the acquire operation executes. This is implemented using
+:c:func:`smp_acquire__after_ctrl_dep`.
+*/
+
+/* From tools/include/linux/compiler.h */
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+/* From tools/virtio/linux/compiler.h */
+#define WRITE_ONCE(var, val) \
+       (*((volatile typeof(val) *)(&(var))) = (val))
+#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var))))
+
+
 #if defined(__x86_64) || defined(__i386__)
-#define read_barrier() __asm__ __volatile__("":::"memory")
-#define write_barrier()        __asm__ __volatile__("":::"memory")
+/* From tools/arch/x86/include/asm/barrier.h */
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb()   asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb()  asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb()  asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define mb()   asm volatile("mfence" ::: "memory")
+#define rmb()  asm volatile("lfence" ::: "memory")
+#define wmb()  asm volatile("sfence" ::: "memory")
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_mb()  asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc")
+#endif
+
+#if defined(__x86_64__)
+#define smp_store_release(p, v)                        \
+do {                                           \
+       barrier();                              \
+       WRITE_ONCE(*(p), (v));                  \
+} while (0)
+
+#define smp_load_acquire(p)                    \
+({                                             \
+       typeof(*p) ___p1 = READ_ONCE(*(p));     \
+       barrier();                              \
+       ___p1;                                  \
+})
+#endif /* defined(__x86_64__) */
 #else
 /*
  * Add arch appropriate definitions. Be safe and use full barriers for
  * archs we don't have support for.
  */
-#define read_barrier() __sync_synchronize()
-#define write_barrier()        __sync_synchronize()
+#define smp_rmb()      __sync_synchronize()
+#define smp_wmb()      __sync_synchronize()
+#endif
+
+/* From tools/include/asm/barrier.h */
+
+#ifndef smp_store_release
+# define smp_store_release(p, v)               \
+do {                                           \
+       smp_mb();                               \
+       WRITE_ONCE(*p, v);                      \
+} while (0)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p)                   \
+({                                             \
+       typeof(*p) ___p1 = READ_ONCE(*p);       \
+       smp_mb();                               \
+       ___p1;                                  \
+})
 #endif
 
 #endif
diff --git a/src/liburing.h b/src/liburing.h
index d3fcd1524540..a350a013ef8a 100644
--- a/src/liburing.h
+++ b/src/liburing.h
@@ -88,11 +88,10 @@ extern int io_uring_register_eventfd(struct io_uring *ring, int fd);
 extern int io_uring_unregister_eventfd(struct io_uring *ring);
 
 #define io_uring_for_each_cqe(ring, head, cqe) \
+       /* smp_load_acquire() enforces the order of tail and CQE reads. */ \
        for (head = *(ring)->cq.khead; \
-            /* See read_barrier() explanation in __io_uring_get_cqe() */ \
-            ({read_barrier(); \
-              cqe = (head != *(ring)->cq.ktail ? \
-               &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL);}); \
+            (cqe = (head != smp_load_acquire((ring)->cq.ktail) ? \
+               &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL)); \
             head++) \
 
 
@@ -105,13 +104,11 @@ static inline void io_uring_cq_advance(struct io_uring *ring,
        if (nr) {
                struct io_uring_cq *cq = &ring->cq;
 
-               (*cq->khead) += nr;
-
                /*
-                * Ensure that the kernel sees our new head, the kernel has
-                * the matching read barrier.
+                * Ensure that the kernel only sees the new value of the head
+                * index after the CQEs have been read.
                 */
-               write_barrier();
+               smp_store_release(cq->khead, *cq->khead + nr);
        }
 }
 
diff --git a/src/queue.c b/src/queue.c
index bec363fc0ebf..72b22935c2ef 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -77,7 +77,7 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
 {
        struct io_uring_sq *sq = &ring->sq;
        const unsigned mask = *sq->kring_mask;
-       unsigned ktail, ktail_next, submitted, to_submit;
+       unsigned ktail, submitted, to_submit;
        unsigned flags;
        int ret;
 
@@ -88,15 +88,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
         * Fill in sqes that we have queued up, adding them to the kernel ring
         */
        submitted = 0;
-       ktail = ktail_next = *sq->ktail;
+       ktail = *sq->ktail;
        to_submit = sq->sqe_tail - sq->sqe_head;
        while (to_submit--) {
-               ktail_next++;
-               read_barrier();
-
                sq->array[ktail & mask] = sq->sqe_head & mask;
-               ktail = ktail_next;
-
+               ktail++;
                sq->sqe_head++;
                submitted++;
        }
@@ -104,21 +100,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
        if (!submitted)
                return 0;
 
-       if (*sq->ktail != ktail) {
-               /*
-                * First write barrier ensures that the SQE stores are updated
-                * with the tail update. This is needed so that the kernel
-                * will never see a tail update without the preceeding sQE
-                * stores being done.
-                */
-               write_barrier();
-               *sq->ktail = ktail;
-               /*
-                * The kernel has the matching read barrier for reading the
-                * SQ tail.
-                */
-               write_barrier();
-       }
+       /*
+        * Ensure that the kernel sees the SQE updates before it sees the tail
+        * update.
+        */
+       smp_store_release(sq->ktail, ktail);
 
        flags = 0;
        if (wait_nr || sq_ring_needs_enter(ring, &flags)) {
diff --git a/test/io_uring_enter.c b/test/io_uring_enter.c
index d6e407e621ff..b25afd5790f3 100644
--- a/test/io_uring_enter.c
+++ b/test/io_uring_enter.c
@@ -262,9 +262,11 @@ main(int argc, char **argv)
        ktail = *sq->ktail;
        sq->array[ktail & mask] = index;
        ++ktail;
-       write_barrier();
-       *sq->ktail = ktail;
-       write_barrier();
+       /*
+        * Ensure that the kernel sees the SQE update before it sees the tail
+        * update.
+        */
+       smp_store_release(sq->ktail, ktail);
 
        ret = io_uring_enter(ring.ring_fd, 1, 0, 0, NULL);
        /* now check to see if our sqe was dropped */
-- 
2.22.0.410.gd8fdbe21b5-goog
