I originally reported this on 3.4.76, but I see the same issue on 3.4.77.
I've also tested with a 3.13 rc and could not reproduce the issue there.
All of these tests were done on a 12-core amd64 machine.
I wrote this simple program (attached) to play around with kernel AIO.
It simply does kernel AIO with O_DIRECT on a small temp file stored on
an ext4 filesystem.
When I run it with "LD_PRELOAD=libhugetlbfs.so", it triggers a "Bad page
state" BUG on exit every time.
Removing LD_PRELOAD from the command line fixes the problem. Note that
my kernel does not use THP: it is NOT compiled with
CONFIG_TRANSPARENT_HUGEPAGE.
kernel: BUG: Bad page state in process aio_test pfn:1b7201
kernel: page:ffffea0006dc8040 count:0
mapcount:1 mapping: (null) index:0x91e
kernel: page flags: 0x20000000008000(tail)
kernel: Modules linked in: nfsd exportfs nfs nfs_acl auth_rpcgss fscache lockd
sunrpc rdma_ucm rdma_cm ib_addr iw_cm ib_uverbs ib_cm ib_sa ib_mad ib_core
ipmi_si ipmi_devintf ioatdma coretemp microcode i2c_i801 serio_raw pcspkr
i2c_core dca dm_mod sg sr_mod cdrom crc32c_intel ahci libahci [last unloaded:
scsi_wait_scan]
kernel: Pid: 5170, comm: aio_test Tainted: G O 3.4.77bug #1
kernel: Call Trace:
kernel: [<ffffffff810f3300>] ? is_free_buddy_page+0xa0/0xd0
kernel: [<ffffffff814c0861>] bad_page+0xe6/0xfc
kernel: [<ffffffff810f3dbc>] free_pages_prepare+0xfc/0x110
kernel: [<ffffffff811afe20>] ? noalloc_get_block_write+0x30/0x30
kernel: [<ffffffff810f3dff>] __free_pages_ok+0x2f/0xd0
kernel: [<ffffffff810f4080>] __free_pages+0x20/0x40
kernel: [<ffffffff81124737>] update_and_free_page+0x77/0x80
kernel: [<ffffffff8112633e>] free_huge_page+0x16e/0x180
kernel: [<ffffffff810f8030>] __put_compound_page+0x20/0x50
kernel: [<ffffffff810f8108>] put_compound_page+0x78/0x140
kernel: [<ffffffff810f8546>] put_page+0x36/0x40
kernel: [<ffffffff81126ede>] __unmap_hugepage_range+0x1ce/0x230
kernel: [<ffffffff81127331>] unmap_hugepage_range+0x51/0x90
kernel: [<ffffffff8110e880>] unmap_single_vma+0x730/0x740
kernel: [<ffffffff8110f05f>] unmap_vmas+0x5f/0x80
kernel: [<ffffffff8111672c>] exit_mmap+0xbc/0x130
kernel: [<ffffffff8112e223>] ? kmem_cache_free+0xd3/0xe0
kernel: [<ffffffff81035155>] mmput+0x35/0xf0
kernel: [<ffffffff8103a58d>] exit_mm+0xfd/0x120
kernel: [<ffffffff8103bb6c>] do_exit+0x16c/0x8b0
kernel: [<ffffffff811540c4>] ? mntput+0x24/0x40
kernel: [<ffffffff81138962>] ? fput+0x192/0x250
kernel: [<ffffffff8103c5ff>] do_group_exit+0x3f/0xa0
kernel: [<ffffffff8103c677>] sys_exit_group+0x17/0x20
kernel: [<ffffffff814d0492>] system_call_fastpath+0x16/0x1b
When I revert the following patch, I cannot reproduce the problem:
commit b07ef016454ff46f98e633b5a6247ca7e343fb67
Author: Khalid Aziz <[email protected]>
Date: Wed Sep 11 14:22:20 2013 -0700
This patch was added to the 3.4 branch for 3.4.69.
27c73ae759774e63313c1fbfeb17ba076cea64c5 might have fixed the issue in
the dev branch, but I have not tried to backport it.
--
Guillaume Morin <[email protected]>
#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

#include <libaio.h>
#define FILE_SIZE 4096
/*
 * Kernel AIO + O_DIRECT reproducer.
 *
 * Fills /tmp/foo with FILE_SIZE pseudo-random bytes, reopens it with
 * O_DIRECT and reads it back in 512-byte chunks through libaio. Each
 * request is tied to an eventfd that is watched with epoll, so a completion
 * is observed first through epoll_wait()/read() on the eventfd and then
 * collected with io_getevents(). The loop ends when a read returns 0 bytes.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main(void)
{
    enum { CHUNK_SIZE = 512, BUF_SIZE = 32768 };

    io_context_t ctx;
    int ctx_ready = 0;
    int fd;
    int fd_odirect = -1;
    int event_fd = -1;
    int epoll_fd = -1;
    int status = 1;
    int rc;
    int i;
    void *buf = NULL;
    size_t offset = 0;
    struct epoll_event ev;
    struct iocb cb;
    struct iocb *cbs[1] = { &cb };

    /*
     * O_CREAT requires an explicit mode argument; O_TRUNC makes sure a
     * stale, larger /tmp/foo cannot change how much data is read back.
     */
    fd = open("/tmp/foo", O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd == -1) {
        perror("open");
        return 1;
    }
    for (i = 0; i < FILE_SIZE; ++i) {
        char c = rand() % 255;

        if (write(fd, &c, 1) != 1) {
            perror("write");
            close(fd);
            return 1;
        }
    }
    if (close(fd) == -1) {
        perror("close");
        return 1;
    }

    fd_odirect = open("/tmp/foo", O_RDONLY | O_DIRECT);
    if (fd_odirect == -1) {
        perror("open");
        goto out;
    }

    /* libaio calls return a negated errno value and do not set errno. */
    memset(&ctx, 0, sizeof(ctx));
    rc = io_queue_init(1, &ctx);
    if (rc != 0) {
        fprintf(stderr, "ctx: %s\n", strerror(-rc));
        goto out;
    }
    ctx_ready = 1;

    event_fd = eventfd(0, EFD_CLOEXEC);
    if (event_fd == -1) {
        perror("eventfd");
        goto out;
    }

    epoll_fd = epoll_create(1);
    if (epoll_fd == -1) {
        perror("epoll_fd");
        goto out;
    }

    /* Zero the whole event so no uninitialized bytes reach the kernel. */
    memset(&ev, 0, sizeof(ev));
    ev.events = EPOLLIN;
    ev.data.fd = event_fd;
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event_fd, &ev) == -1) {
        perror("epoll_ctl");
        goto out;
    }

    /* posix_memalign() reports failure through its return value. */
    rc = posix_memalign(&buf, CHUNK_SIZE, BUF_SIZE);
    if (rc != 0) {
        fprintf(stderr, "posix_memalign: %s\n", strerror(rc));
        buf = NULL;
        goto out;
    }

    while (1) {
        struct timespec ts = { 0, 0 };
        struct io_event ioev;
        uint64_t counter = 0;
        long res;

        io_prep_pread(&cb, fd_odirect, (char *)buf + offset, CHUNK_SIZE,
                      offset);
        io_set_eventfd(&cb, event_fd);
        rc = io_submit(ctx, 1, cbs);
        if (rc != 1) {
            fprintf(stderr, "io_submit: %s\n",
                    rc < 0 ? strerror(-rc) : "request not submitted");
            goto out;
        }

        /* Block until the eventfd is signalled; retry if interrupted. */
        do {
            rc = epoll_wait(epoll_fd, &ev, 1, -1);
        } while (rc == -1 && errno == EINTR);
        if (rc != 1) {
            perror("epoll_wait");
            goto out;
        }

        /* An eventfd read always transfers one 8-byte counter. */
        if (read(event_fd, &counter, sizeof(counter)) !=
            (ssize_t)sizeof(counter)) {
            perror("read");
            goto out;
        }
        printf("event_fd returned %llu\n", (unsigned long long)counter);

        /* Zero timeout: the completion was already signalled above. */
        rc = io_getevents(ctx, 1, 1, &ioev, &ts);
        if (rc != 1) {
            fprintf(stderr, "io_getevents: %s\n",
                    rc < 0 ? strerror(-rc) : "no completion available");
            goto out;
        }

        /* res holds the byte count, or a negated errno on I/O failure. */
        res = (long)ioev.res;
        printf("Read 1 res %ld res2 %ld\n", res, (long)ioev.res2);
        if (res < 0) {
            fprintf(stderr, "aio read: %s\n", strerror((int)-res));
            goto out;
        }
        if (res == 0) {
            break;
        }
        offset += (size_t)res;

        if ((offset + CHUNK_SIZE) > BUF_SIZE) {
            puts("ERROR - reading past buffer");
            goto out;
        }
    }
    status = 0;

out:
    free(buf);
    if (ctx_ready) {
        io_destroy(ctx);
    }
    if (event_fd != -1) {
        close(event_fd);
    }
    if (epoll_fd != -1) {
        close(epoll_fd);
    }
    if (fd_odirect != -1) {
        close(fd_odirect);
    }
    return status;
}