Getters such as job_get_aio_context are often wrong because the AioContext can change immediately after returning.

So, I wonder if job.aio_context should be protected with a kind of "fake rwlock": read under BQL or job_lock, write under BQL+job_lock. For this to work, you can add an assertion for qemu_in_main_thread() to child_job_set_aio_ctx, or even better have the assertion in a wrapper API job_set_aio_context_locked().

And then, we can remove job_get_aio_context().

Let's look at all cases individually:

On 1/5/22 15:02, Emanuele Giuseppe Esposito wrote:
diff --git a/block/commit.c b/block/commit.c
index f639eb49c5..961b57edf0 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -369,7 +369,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
          goto fail;
      }
- s->base = blk_new(s->common.job.aio_context,
+    s->base = blk_new(job_get_aio_context(&s->common.job),
                        base_perms,
                        BLK_PERM_CONSISTENT_READ
                        | BLK_PERM_GRAPH_MOD

Here the AioContext is that of bs. It cannot change because we're holding the BQL. Replace with bdrv_get_aio_context().

@@ -382,7 +382,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
      s->base_bs = base;
/* Required permissions are already taken with block_job_add_bdrv() */
-    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
+    s->top = blk_new(job_get_aio_context(&s->common.job), 0, BLK_PERM_ALL);
      ret = blk_insert_bs(s->top, top, errp);
      if (ret < 0) {
          goto fail;

Same.

diff --git a/block/mirror.c b/block/mirror.c
index 41450df55c..72b4367b4e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1743,7 +1743,7 @@ static BlockJob *mirror_start_job(
          target_perms |= BLK_PERM_GRAPH_MOD;
      }
- s->target = blk_new(s->common.job.aio_context,
+    s->target = blk_new(job_get_aio_context(&s->common.job),
                          target_perms, target_shared_perms);
      ret = blk_insert_bs(s->target, target, errp);
      if (ret < 0) {

Same.

diff --git a/block/replication.c b/block/replication.c
index 50ea778937..68018948b9 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -148,8 +148,8 @@ static void replication_close(BlockDriverState *bs)
      }
      if (s->stage == BLOCK_REPLICATION_FAILOVER) {
          commit_job = &s->commit_job->job;
-        assert(commit_job->aio_context == qemu_get_current_aio_context());
          WITH_JOB_LOCK_GUARD() {
+            assert(commit_job->aio_context == qemu_get_current_aio_context());
              job_cancel_sync_locked(commit_job, false);
          }
      }

Ok.

diff --git a/blockjob.c b/blockjob.c
index cf1f49f6c2..468ba735c5 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -155,14 +155,16 @@ static void child_job_set_aio_ctx(BdrvChild *c, AioContext *ctx,
          bdrv_set_aio_context_ignore(sibling->bs, ctx, ignore);
      }
- job->job.aio_context = ctx;
+    WITH_JOB_LOCK_GUARD() {
+        job->job.aio_context = ctx;
+    }
  }
static AioContext *child_job_get_parent_aio_context(BdrvChild *c)
  {
      BlockJob *job = c->opaque;
- return job->job.aio_context;
+    return job_get_aio_context(&job->job);
  }
static const BdrvChildClass child_job = {

Both called with BQL held, I think.

@@ -218,19 +220,21 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
  {
      BdrvChild *c;
      bool need_context_ops;
+    AioContext *job_aiocontext;
      assert(qemu_in_main_thread());
      bdrv_ref(bs);
-    need_context_ops = bdrv_get_aio_context(bs) != job->job.aio_context;
+    job_aiocontext = job_get_aio_context(&job->job);
+    need_context_ops = bdrv_get_aio_context(bs) != job_aiocontext;
- if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
-        aio_context_release(job->job.aio_context);
+    if (need_context_ops && job_aiocontext != qemu_get_aio_context()) {
+        aio_context_release(job_aiocontext);
      }
      c = bdrv_root_attach_child(bs, name, &child_job, 0, perm, shared_perm, job,
                                 errp);
-    if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
-        aio_context_acquire(job->job.aio_context);
+    if (need_context_ops && job_aiocontext != qemu_get_aio_context()) {
+        aio_context_acquire(job_aiocontext);
      }
      if (c == NULL) {
          return -EPERM;

BQL held, too.

diff --git a/job.c b/job.c
index f16a4ef542..8a5b710d9b 100644
--- a/job.c
+++ b/job.c
@@ -566,7 +566,7 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
      job->busy = true;
      real_job_unlock();
      job_unlock();
-    aio_co_enter(job->aio_context, job->co);
+    aio_co_enter(job_get_aio_context(job), job->co);
      job_lock();
  }

If you replace aio_co_enter with aio_co_schedule, you can call it without dropping the lock. The difference being that aio_co_schedule will always go through a bottom half.

@@ -1138,7 +1138,6 @@ static void coroutine_fn job_co_entry(void *opaque)
      Job *job = opaque;
      int ret;
- assert(job->aio_context == qemu_get_current_aio_context());
      assert(job && job->driver && job->driver->run);
      job_pause_point(job);
      ret = job->driver->run(job, &job->err);
@@ -1177,7 +1176,7 @@ void job_start(Job *job)
          job->paused = false;
          job_state_transition_locked(job, JOB_STATUS_RUNNING);
      }
-    aio_co_enter(job->aio_context, job->co);
+    aio_co_enter(job_get_aio_context(job), job->co);

Better to use aio_co_schedule here, too, and move it under the previous WITH_JOB_LOCK_GUARD.

  }
void job_cancel_locked(Job *job, bool force)
@@ -1303,7 +1302,8 @@ int job_finish_sync_locked(Job *job, void (*finish)(Job *, Error **errp),
      }
job_unlock();
-    AIO_WAIT_WHILE(job->aio_context, (job_enter(job), !job_is_completed(job)));
+    AIO_WAIT_WHILE(job_get_aio_context(job),
+                   (job_enter(job), !job_is_completed(job)));
      job_lock();

Here I think we are also holding the BQL, because this function is "sync", so it's safe to use job->aio_context.

Paolo

Reply via email to