A vmcore corruption issue has been observed on the powerpc arch [1]. Although the original issue was closed, it can still be reproduced with upstream makedumpfile.
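Not part of the patch -- below is a minimal standalone sketch, assuming GCC/Clang __atomic builtins, of the ready-flag handshake described in the next paragraph. The slot struct, producer() and FLAG_* values here are simplified stand-ins, not makedumpfile code; the point is that the flag is only ever touched through atomic loads/stores, as this patch does.

/*
 * Illustrative sketch only (not makedumpfile code).  A worker marks a
 * slot FLAG_READY with an atomic store and the writer polls it with an
 * atomic load, so the flag is never read and written unsynchronized.
 * Build with: gcc -O2 -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

enum { FLAG_UNUSED, FLAG_FILLING, FLAG_READY };

struct slot {
	int ready;		/* accessed only via __atomic builtins */
	unsigned long pfn;	/* payload filled in by the producer   */
};

static struct slot slot = { FLAG_UNUSED, 0 };

static void *producer(void *arg)
{
	__atomic_store_n(&slot.ready, FLAG_FILLING, __ATOMIC_SEQ_CST);
	slot.pfn = 42;		/* pretend page processing happened */
	__atomic_store_n(&slot.ready, FLAG_READY, __ATOMIC_SEQ_CST);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, producer, NULL);

	/* consumer: spin until the slot is published, then use it */
	while (__atomic_load_n(&slot.ready, __ATOMIC_SEQ_CST) != FLAG_READY)
		;
	printf("consumed pfn %lu\n", slot.pfn);

	__atomic_store_n(&slot.ready, FLAG_UNUSED, __ATOMIC_SEQ_CST);
	pthread_join(tid, NULL);
	return 0;
}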
With --num-threads=N enabled, N sub-threads are created. All sub-threads are producers, responsible for mm page processing, e.g. compression. The main thread is the consumer, responsible for writing the compressed data to the file. page_flag_buf->ready is used to synchronize the main thread and the sub-threads: when a sub-thread finishes processing a page, it sets the ready flag to FLAG_READY. Meanwhile, the main thread loops over the ready flags of all threads and breaks out of the loop when it finds FLAG_READY.

page_flag_buf->ready is read and written by the main thread and the sub-threads simultaneously, but it is unprotected and therefore unsafe. I have tested that both a mutex and atomic read/write fix this issue. This patch takes the atomic read/write approach for its simplicity.

[1]: https://github.com/makedumpfile/makedumpfile/issues/15

Signed-off-by: Tao Liu <l...@redhat.com>
---
 makedumpfile.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 2d3b08b..bac45c2 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -8621,7 +8621,8 @@ kdump_thread_function_cyclic(void *arg) {
 	while (buf_ready == FALSE) {
 		pthread_testcancel();
-		if (page_flag_buf->ready == FLAG_READY)
+		if (__atomic_load_n(&page_flag_buf->ready,
+				__ATOMIC_SEQ_CST) == FLAG_READY)
 			continue;
 		/* get next dumpable pfn */
@@ -8637,7 +8638,8 @@ kdump_thread_function_cyclic(void *arg) {
 		info->current_pfn = pfn + 1;
 		page_flag_buf->pfn = pfn;
-		page_flag_buf->ready = FLAG_FILLING;
+		__atomic_store_n(&page_flag_buf->ready, FLAG_FILLING,
+				__ATOMIC_SEQ_CST);
 		pthread_mutex_unlock(&info->current_pfn_mutex);
 		sem_post(&info->page_flag_buf_sem);
@@ -8726,7 +8728,8 @@ kdump_thread_function_cyclic(void *arg) {
 		page_flag_buf->index = index;
 		buf_ready = TRUE;
 next:
-		page_flag_buf->ready = FLAG_READY;
+		__atomic_store_n(&page_flag_buf->ready, FLAG_READY,
+				__ATOMIC_SEQ_CST);
 		page_flag_buf = page_flag_buf->next;
 	}
@@ -8855,7 +8858,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
 		 * current_pfn is used for recording the value of pfn when checking the pfn.
 		 */
 		for (i = 0; i < info->num_threads; i++) {
-			if (info->page_flag_buf[i]->ready == FLAG_UNUSED)
+			if (__atomic_load_n(&info->page_flag_buf[i]->ready,
+					__ATOMIC_SEQ_CST) == FLAG_UNUSED)
 				continue;
 			temp_pfn = info->page_flag_buf[i]->pfn;
@@ -8863,7 +8867,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
 			 * count how many threads have reached the end.
 			 */
 			if (temp_pfn >= end_pfn) {
-				info->page_flag_buf[i]->ready = FLAG_UNUSED;
+				__atomic_store_n(&info->page_flag_buf[i]->ready,
+						FLAG_UNUSED, __ATOMIC_SEQ_CST);
 				end_count++;
 				continue;
 			}
@@ -8885,7 +8890,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
 		 * If the page_flag_buf is not ready, the pfn recorded may be changed.
 		 * So we should recheck.
 		 */
-		if (info->page_flag_buf[consuming]->ready != FLAG_READY) {
+		if (__atomic_load_n(&info->page_flag_buf[consuming]->ready,
+				__ATOMIC_SEQ_CST) != FLAG_READY) {
 			clock_gettime(CLOCK_MONOTONIC, &new);
 			if (new.tv_sec - last.tv_sec > WAIT_TIME) {
 				ERRMSG("Can't get data of pfn.\n");
@@ -8927,7 +8933,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
 				goto out;
 			page_data_buf[index].used = FALSE;
 		}
-		info->page_flag_buf[consuming]->ready = FLAG_UNUSED;
+		__atomic_store_n(&info->page_flag_buf[consuming]->ready,
+				FLAG_UNUSED, __ATOMIC_SEQ_CST);
 		info->page_flag_buf[consuming] = info->page_flag_buf[consuming]->next;
 	}
 finish:
-- 
2.47.0