Hi, There is a test in the Fish shell (tests::history::test_history_races) that consistently fails when I run it. The test simulates multiple processes/threads trying to write to the shell history file at the same time. In my case, the test freezes/deadlocks with errors like "Bad Addr" and "Is Directory". When I add a sleep, the freezes/deadlocks disappear, but the test still eventually fails because the fake history is not the expected size. See https://github.com/fish-shell/fish-shell/issues/11933 for more details.
I wrote a test case in pure C (attached) that also triggers the issue,
although less consistently (it fails in roughly 30-50% of runs).
To compile: gcc main.c -o test.exe
To run: ./test.exe
Most failures look like this:
```
$ ./test.exe
tmp_dir: /tmp/flockc2Hz4c
open file error: 21 - Is a directory
/tmp/flockc2Hz4c/append_file
assertion "file_fd >= 0" failed: file "main.c", line 49, function:
thread_func
Aborted
```
Occasionally (maybe 10% of the time), it looks like this:
```
$ ./test.exe
tmp_dir: /tmp/flock5Oly9J
lock error: 14 - Bad address
assertion "lock_res == 0" failed: file "main.c", line 38,
function: thread_func
Aborted
```
I believe the freeze/deadlock in the Fish test happens because, unlike my
test, Fish does not assert/crash on these errors, so the next time it accesses
the history file, several threads end up deadlocked inside Cygwin internals.
If that helps, this is a partial capture of the stack traces at one such time:
```
Thread 9
#2 0x00000001800d487f in muto::acquire (this=0x1802c24c0
<lock_process::locker>, ms=ms@entry=4294967295) at
/d/S/B/src/msys2-runtime/winsup/cygwin/sync.cc:84
#3 0x00000001800dd6e0 in dtable::lock (this=<optimized out>) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/dtable.h:77
#4 cygheap_fdnew::cygheap_fdnew (this=<synthetic pointer>,
seed_fd=-1, lockit=true) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/cygheap.h:593
#5 open (unix_path=0xa0002b3b0
"[...]/fish-shell/target/fish-test-home", flags=262144) at
/d/S/B/src/msys2-runtime/winsup/cygwin/syscalls.cc:1576
Thread 10
#2 0x00000001800d487f in muto::acquire (this=0x1802c24c0
<lock_process::locker>, ms=ms@entry=4294967295) at
/d/S/B/src/msys2-runtime/winsup/cygwin/sync.cc:84
#3 0x00000001800dd6e0 in dtable::lock (this=<optimized out>) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/dtable.h:77
#4 cygheap_fdnew::cygheap_fdnew (this=<synthetic pointer>,
seed_fd=-1, lockit=true) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/cygheap.h:593
#5 open (unix_path=0xa0002bfe0
"[...]/fish-shell/target/fish-test-home/race_test_history.FwyAgK",
flags=264706) at
/d/S/B/src/msys2-runtime/winsup/cygwin/syscalls.cc:1576
Thread 11
#2 0x00000001800670bb in inode_t::LOCK (this=0x80000ba20) at
/d/S/B/src/msys2-runtime/winsup/cygwin/flock.cc:314
#3 inode_t::get (dev=1881899537, ino=ino@entry=10977524092162599,
create_if_missing=create_if_missing@entry=false, lock=lock@entry=true)
at /d/S/B/src/msys2-runtime/winsup/cygwin/flock.cc:504
#4 0x0000000180068eb1 in fhandler_base::del_my_locks
(this=0x80000b810, from=on_close) at
/d/S/B/src/msys2-runtime/winsup/cygwin/flock.cc:402
#5 0x000000018010d5bf in fhandler_base::close_with_arch
(this=0x80000b810, flag=flag@entry=-1) at
/d/S/B/src/msys2-runtime/winsup/cygwin/fhandler/base.cc:1306
#6 0x00000001800de36b in __close (fd=5, flag=-1) at
/d/S/B/src/msys2-runtime/winsup/cygwin/syscalls.cc:1710
#7 close (fd=5) at /d/S/B/src/msys2-runtime/winsup/cygwin/syscalls.cc:1722
Thread 12
#2 0x00000001800d487f in muto::acquire (this=0x1802c24c0
<lock_process::locker>, ms=ms@entry=4294967295) at
/d/S/B/src/msys2-runtime/winsup/cygwin/sync.cc:84
#3 0x00000001800dd6e0 in dtable::lock (this=<optimized out>) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/dtable.h:77
#4 cygheap_fdnew::cygheap_fdnew (this=<synthetic pointer>,
seed_fd=-1, lockit=true) at
/d/S/B/src/msys2-runtime/winsup/cygwin/local_includes/cygheap.h:593
#5 open (unix_path=0x7ff10b488
"[...]/fish-shell/target/fish-test-home/race_test_history.pZO5DS",
flags=263169) at
/d/S/B/src/msys2-runtime/winsup/cygwin/syscalls.cc:1576
```
The freeze/deadlock can be reproduced in my C code by adding a
"continue" statement inside the "if (lock_res != 0) {" block, so that the
assert just after it is skipped.
I haven't been able to reproduce the missing data in the history file,
so I don't know whether it's an issue in Fish or flock occasionally failing
to lock properly. So far the test passes on Linux and macOS.
Thanks,
Nahor
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>
// Test parameters. These are enumeration constants rather than `const int`
// objects: in C only the former are integer constant expressions, so arrays
// sized with them are ordinary fixed-size arrays instead of VLAs.
enum {
    THREAD_COUNT = 10,     // number of concurrent writer threads
    ITERATION_COUNT = 100, // lock/append cycles performed by each thread
    MAX_PATH = 128         // size in bytes of the path and line buffers
};
/* Per-thread parameters handed to thread_func() through pthread_create(). */
typedef struct thread_args {
    int thread_id;              /* thread index; written into every appended line */
    pthread_barrier_t *barrier; /* barrier each thread waits on before its loop */
    const char *dir;            /* directory that is locked and holds the appended file */
    const char *file;           /* set to NULL by main(); not read by thread_func() */
} thread_args;
static void *thread_func(void *arg) {
thread_args *args = (thread_args *)arg;
int res = pthread_barrier_wait(args->barrier);
assert(res == 0 || res == PTHREAD_BARRIER_SERIAL_THREAD);
for (int i = 0; i < ITERATION_COUNT; ++i) {
// open and lock dir
int dir_fd = open(args->dir, O_RDONLY | O_CLOEXEC, 0);
assert(dir_fd >= 0);
int lock_res = flock(dir_fd, LOCK_EX);
if (lock_res != 0) {
printf("lock error: %d - %s\n", errno, strerror(errno));
}
assert(lock_res == 0);
// open file
char file[MAX_PATH];
assert(snprintf(file, MAX_PATH, "%s/%s", args->dir, "append_file") <
MAX_PATH);
int file_fd = open(file, O_WRONLY | O_APPEND | O_CREAT | O_CLOEXEC, 0600);
if (file_fd < 0) {
printf("open file error: %d - %s\n\t%s\n", errno, strerror(errno), file);
}
assert(file_fd >= 0);
// write data
char data[MAX_PATH];
int len = snprintf(data, MAX_PATH, "%d-%d\n", args->thread_id, i);
assert(len < MAX_PATH);
int write_res = write(file_fd, data, len);
if (write_res != len) {
printf("write error: %d != %d: %d - %s\n", write_res, len, errno,
strerror(errno));
}
assert(write_res == len);
assert(fsync(file_fd) == 0);
assert(close(file_fd) == 0);
assert(close(dir_fd) == 0);
}
printf("done[%i]\n", args->thread_id);
return NULL;
}
/*
 * Test driver: creates a fresh temporary directory, starts THREAD_COUNT
 * threads running thread_func() against it, waits for all of them, and
 * prints "done[main]" on success.
 *
 * The temporary directory is printed and left in place on exit.
 *
 * Every call that has a side effect is made outside assert(), so the program
 * still creates the directory and runs the threads when built with NDEBUG.
 */
int main(void) {
    // mkdtemp() rewrites the trailing XXXXXX in place, so the template has to
    // live in a writable array; sizing it from the literal avoids any copy.
    char tmp_dir[] = "/tmp/flockXXXXXX";
    char *made = mkdtemp(tmp_dir);
    assert(made != NULL);
    (void)made;
    printf("tmp_dir: %s\n", tmp_dir);

    // The barrier count matches the number of worker threads created below.
    pthread_barrier_t barrier;
    int rc = pthread_barrier_init(&barrier, NULL, THREAD_COUNT);
    assert(rc == 0);

    pthread_t thread[THREAD_COUNT];
    thread_args args[THREAD_COUNT];
    for (int i = 0; i < THREAD_COUNT; ++i) {
        args[i].thread_id = i;
        args[i].barrier = &barrier;
        args[i].dir = tmp_dir;
        args[i].file = NULL;
        rc = pthread_create(&thread[i], NULL, &thread_func, &args[i]);
        assert(rc == 0);
    }

    // thread_func() always returns NULL, so the exit value is not collected.
    for (int i = 0; i < THREAD_COUNT; ++i) {
        rc = pthread_join(thread[i], NULL);
        assert(rc == 0);
    }

    rc = pthread_barrier_destroy(&barrier);
    assert(rc == 0);
    (void)rc;

    printf("done[main]\n");
    return 0;
}
cygcheck.out
Description: Binary data
-- Problem reports: https://cygwin.com/problems.html FAQ: https://cygwin.com/faq/ Documentation: https://cygwin.com/docs.html Unsubscribe info: https://cygwin.com/ml/#unsubscribe-simple

