Getters such as job_get_aio_context are often wrong because the
AioContext can change immediately after returning.
So, I wonder if job.aio_context should be protected with a kind of "fake
rwlock": reads allowed under either the BQL or job_lock, writes only
under both the BQL and job_lock. For this to work, you can add an
assertion for qemu_in_main_thread() to child_job_set_aio_ctx, or even
better put the assertion in a wrapper API, job_set_aio_context_locked().
Then job_get_aio_context() can be removed altogether.
Let's look at all cases individually:
On 1/5/22 15:02, Emanuele Giuseppe Esposito wrote:
diff --git a/block/commit.c b/block/commit.c
index f639eb49c5..961b57edf0 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -369,7 +369,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
goto fail;
}
- s->base = blk_new(s->common.job.aio_context,
+ s->base = blk_new(job_get_aio_context(&s->common.job),
base_perms,
BLK_PERM_CONSISTENT_READ
| BLK_PERM_GRAPH_MOD
Here the AioContext is that of bs, and it cannot change because we're
under the BQL. Replace job_get_aio_context() with bdrv_get_aio_context(bs).
@@ -382,7 +382,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
s->base_bs = base;
/* Required permissions are already taken with block_job_add_bdrv() */
- s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
+ s->top = blk_new(job_get_aio_context(&s->common.job), 0, BLK_PERM_ALL);
ret = blk_insert_bs(s->top, top, errp);
if (ret < 0) {
goto fail;
Same.
diff --git a/block/mirror.c b/block/mirror.c
index 41450df55c..72b4367b4e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1743,7 +1743,7 @@ static BlockJob *mirror_start_job(
target_perms |= BLK_PERM_GRAPH_MOD;
}
- s->target = blk_new(s->common.job.aio_context,
+ s->target = blk_new(job_get_aio_context(&s->common.job),
target_perms, target_shared_perms);
ret = blk_insert_bs(s->target, target, errp);
if (ret < 0) {
Same.
diff --git a/block/replication.c b/block/replication.c
index 50ea778937..68018948b9 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -148,8 +148,8 @@ static void replication_close(BlockDriverState *bs)
}
if (s->stage == BLOCK_REPLICATION_FAILOVER) {
commit_job = &s->commit_job->job;
- assert(commit_job->aio_context == qemu_get_current_aio_context());
WITH_JOB_LOCK_GUARD() {
+ assert(commit_job->aio_context == qemu_get_current_aio_context());
job_cancel_sync_locked(commit_job, false);
}
}
Ok.
diff --git a/blockjob.c b/blockjob.c
index cf1f49f6c2..468ba735c5 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -155,14 +155,16 @@ static void child_job_set_aio_ctx(BdrvChild *c,
AioContext *ctx,
bdrv_set_aio_context_ignore(sibling->bs, ctx, ignore);
}
- job->job.aio_context = ctx;
+ WITH_JOB_LOCK_GUARD() {
+ job->job.aio_context = ctx;
+ }
}
static AioContext *child_job_get_parent_aio_context(BdrvChild *c)
{
BlockJob *job = c->opaque;
- return job->job.aio_context;
+ return job_get_aio_context(&job->job);
}
static const BdrvChildClass child_job = {
Both of these are called with the BQL held, I think.
@@ -218,19 +220,21 @@ int block_job_add_bdrv(BlockJob *job, const char *name,
BlockDriverState *bs,
{
BdrvChild *c;
bool need_context_ops;
+ AioContext *job_aiocontext;
assert(qemu_in_main_thread());
bdrv_ref(bs);
- need_context_ops = bdrv_get_aio_context(bs) != job->job.aio_context;
+ job_aiocontext = job_get_aio_context(&job->job);
+ need_context_ops = bdrv_get_aio_context(bs) != job_aiocontext;
- if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
- aio_context_release(job->job.aio_context);
+ if (need_context_ops && job_aiocontext != qemu_get_aio_context()) {
+ aio_context_release(job_aiocontext);
}
c = bdrv_root_attach_child(bs, name, &child_job, 0, perm, shared_perm,
job,
errp);
- if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) {
- aio_context_acquire(job->job.aio_context);
+ if (need_context_ops && job_aiocontext != qemu_get_aio_context()) {
+ aio_context_acquire(job_aiocontext);
}
if (c == NULL) {
return -EPERM;
BQL held, too.
diff --git a/job.c b/job.c
index f16a4ef542..8a5b710d9b 100644
--- a/job.c
+++ b/job.c
@@ -566,7 +566,7 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
job->busy = true;
real_job_unlock();
job_unlock();
- aio_co_enter(job->aio_context, job->co);
+ aio_co_enter(job_get_aio_context(job), job->co);
job_lock();
}
If you replace aio_co_enter with aio_co_schedule, you can call it
without dropping the lock. The difference is that aio_co_schedule
always goes through a bottom half.
@@ -1138,7 +1138,6 @@ static void coroutine_fn job_co_entry(void *opaque)
Job *job = opaque;
int ret;
- assert(job->aio_context == qemu_get_current_aio_context());
assert(job && job->driver && job->driver->run);
job_pause_point(job);
ret = job->driver->run(job, &job->err);
@@ -1177,7 +1176,7 @@ void job_start(Job *job)
job->paused = false;
job_state_transition_locked(job, JOB_STATUS_RUNNING);
}
- aio_co_enter(job->aio_context, job->co);
+ aio_co_enter(job_get_aio_context(job), job->co);
Better to use aio_co_schedule here, too, and move it under the previous
WITH_JOB_LOCK_GUARD.
}
void job_cancel_locked(Job *job, bool force)
@@ -1303,7 +1302,8 @@ int job_finish_sync_locked(Job *job, void (*finish)(Job
*, Error **errp),
}
job_unlock();
- AIO_WAIT_WHILE(job->aio_context, (job_enter(job), !job_is_completed(job)));
+ AIO_WAIT_WHILE(job_get_aio_context(job),
+ (job_enter(job), !job_is_completed(job)));
job_lock();
Here I think we are also holding the BQL, because this function is
synchronous ("sync"), so it is safe to read job->aio_context directly.
Paolo