Hi all,

We're trying to use inotify for change notification in Samba 4. Most of the
time it works very well, but under certain conditions the server can
become unresponsive (to client NFS requests) due to some kind of deadlock,
possibly caused by inotify_clean() running as a cyclic.

Here are the steps to reproduce:

1. Suppose you have a ZFS file system with plenty of space, say /zpool/temp,
created in the zpool named 'zpool'. Download the 2 attached C programs and
compile them.

2. In terminal window #1, run the following dtrace command to monitor
'dirty_total':

dtrace -n 'txg-syncing /((dsl_pool_t *)arg0)->dp_spa->spa_name == "zpool"/
{printf("%4dMB (%2d) of %4dMB used", ((dsl_pool_t
*)arg0)->dp_dirty_total/1024/1024, ((dsl_pool_t
*)arg0)->dp_dirty_total*100/`zfs_dirty_data_max, `zfs_dirty_data_max / 1024
/ 1024);}'

Replace the string "zpool" with your actual zpool name.

3. In terminal window #2, run the following command:

inotify_stress_new -f /zpool/temp -n 4

It forks 4 processes, each of which uses the inotify_* functions to monitor
the folder /zpool/temp while creating and removing temporary files in it.

4. In terminal window #3, run the following command:

write-files -f /zpool/temp -n 50

It forks 50 processes, each of which creates a subfolder under /zpool/temp
and keeps creating 1 GB files in it.

Depending on your machine's horsepower, you may want to adjust the number
of processes for the write-files program. Its purpose is basically to push
the zpool's dirty_total beyond 60% of zfs_dirty_data_max.

Normally, within a few minutes, terminal window #1 will stop printing
anything, which means the issue has been reproduced.

Once the issue is reproduced, I run 'echo ::threadlist -v 10 | mdb -k'
to dump the kernel threads. So far I have observed two patterns:

Case No. 1: inotify_clean() is stuck at cv_timedwait_hires()
---------------------------
    swtch+0x82()
    cv_timedwait_hires+0xec(ffffff01e852ae2e, ffffff01e852ae30,
1047be67f9694,
    186a0, 3)
    dmu_tx_delay+0x147(ffffff8114229c00, d80448bb)
    dmu_tx_wait+0x8e(ffffff8114229c00)
    dmu_tx_assign+0x55(ffffff8114229c00, 1)
    dmu_free_long_range_impl+0x9f(ffffff443a1943c0, ffffff4be51507e8, 0,
ffffffffffffffff)
    dmu_free_long_range+0x62(ffffff443a1943c0, 179294d, 0, ffffffffffffffff)
    zfs_rmnode+0x52(ffffff44d20891f0)
    zfs_zinactive+0xe8(ffffff44d20891f0)
    zfs_inactive+0x75(ffffff450b8e8500, 0, 0)
    fop_inactive+0x76(ffffff450b8e8500, 0, 0)
    vn_rele+0x82(ffffff450b8e8500)
    inotify_clean+0xfc(ffffff43abf54810)
    cyclic_softint+0xf3(ffffff42eb589a80, 0)
    cbe_low_level+0x14()
    av_dispatch_softvect+0x78(2)
    apix_dispatch_softint+0x35(0, 0)
    switch_sp_and_call+0x13()
    0xffffff42eb589a80()
    apix_do_interrupt+0xfe(ffffff01e84faad0, 0)
    _interrupt+0xba()
    i86_mwait+0xd()
    cpu_idle_mwait+0x109()
    idle+0xa7()
    thread_start+8()
---------------------------
Case No. 2: inotify_clean() waits for a mutex held by another thread,
which is itself stuck at cv_timedwait_hires()
---------------------------
> ffffff003d29cc40::findstack -v
stack pointer for thread ffffff003d29cc40: ffffff003d29c960
[ ffffff003d29c960 resume_from_intr+0xb7() ]
  ffffff003d29c990 swtch+0x82()
  ffffff003d29ca30 turnstile_block+0x21a(ffffff090bf5bd58, 0,
ffffff096b1d9808, fffffffffbc07aa0, 0, 0)
  ffffff003d29caa0 mutex_vector_enter+0x3a3(ffffff096b1d9808)
  ffffff003d29caf0 inotify_clean+0x2f(ffffff096b1d9808)
  ffffff003d29cb90 cyclic_softint+0xfd(ffffff090b55bb00, 0)
  ffffff003d29cba0 cbe_low_level+0x14()
  ffffff003d29cbf0 av_dispatch_softvect+0x78(2)
  ffffff003d29cc20 dispatch_softint+0x39(0, 0)
  ffffff003d5a8e60 switch_sp_and_call+0x13()
  ffffff003d5a8e90 0xffffff0900000001()
  4136b0 [no mapping for address]
> ffffff096b1d9808::mutex
            ADDR  TYPE             HELD MINSPL OLDSPL WAITERS
ffffff096b1d9808 adapt ffffff0912213000      -      -     yes
> ffffff0912213000::findstack -v
stack pointer for thread ffffff0912213000: ffffff003d8372f0
[ ffffff003d8372f0 _resume_from_idle+0xf4() ]
  ffffff003d837320 swtch+0x141()
  ffffff003d8373b0 cv_timedwait_hires+0xec(ffffff09122131ee,
ffffff09122131f0, 7d07b0bbc4, 186a0, 3)
  ffffff003d837410 dmu_tx_delay+0x147(ffffff096f22a900, 87f76900)
  ffffff003d837460 dmu_tx_wait+0x8e(ffffff096f22a900)
  ffffff003d8374a0 dmu_tx_assign+0x55(ffffff096f22a900, 1)
  ffffff003d837520 dmu_free_long_range_impl+0xa7(ffffff0914365780,
ffffff096cb94d28, 0, ffffffffffffffff)
  ffffff003d837590 dmu_free_long_range+0x62(ffffff0914365780, 89e, 0,
ffffffffffffffff)
  ffffff003d8375e0 zfs_rmnode+0x52(ffffff0978376220)
  ffffff003d837620 zfs_zinactive+0xe8(ffffff0978376220)
  ffffff003d837680 zfs_inactive+0x75(ffffff09783d0e00, ffffff096ed6abb8, 0)
  ffffff003d8376e0 fop_inactive+0x76(ffffff09783d0e00, ffffff096ed6abb8, 0)
  ffffff003d837710 vn_rele+0x82(ffffff09783d0e00)
  ffffff003d837750 inotify_watch_remove+0x93(ffffff096b1d9808,
ffffff097a002540)
  ffffff003d837830 inotify_rm_watch+0x5b(ffffff096b1d9808, 1)
  ffffff003d837cc0 inotify_ioctl+0x113(12000000004, 696e7902, 1, 202003,
ffffff096ed6abb8, ffffff003d837ea8)
  ffffff003d837d00 cdev_ioctl+0x39(12000000004, 696e7902, 1, 202003,
ffffff096ed6abb8, ffffff003d837ea8)
  ffffff003d837d50 spec_ioctl+0x60(ffffff0968d52d40, 696e7902, 1, 202003,
ffffff096ed6abb8, ffffff003d837ea8, 0)
  ffffff003d837de0 fop_ioctl+0x55(ffffff0968d52d40, 696e7902, 1, 202003,
ffffff096ed6abb8, ffffff003d837ea8, 0)
  ffffff003d837f00 ioctl+0x9b(3, 696e7902, 1)
  ffffff003d837f10 sys_syscall+0x196()
---------------------------

By the way, I've reproduced this issue using the latest image,
joyent_20150514T133314Z.

Thanks,

-Youzhong



-------------------------------------------
smartos-discuss
Archives: https://www.listbox.com/member/archive/184463/=now
RSS Feed: https://www.listbox.com/member/archive/rss/184463/25769125-55cfbc00
Modify Your Subscription: 
https://www.listbox.com/member/?member_id=25769125&id_secret=25769125-7688e9fb
Powered by Listbox: http://www.listbox.com
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>
#ifdef __sun__
#include <sys/filio.h>
#endif

#include <unistd.h>
#include <stropts.h>
#include <poll.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#include <dirent.h>
#include <strings.h>
#include <alloca.h>

#include <limits.h>
#include <stddef.h>
#include <sys/stat.h>

/* Set every byte of the object x to zero; x must be the object itself, not a pointer to it. */
#define ZERO_STRUCT(x) memset((void *)&(x), 0, sizeof(x))

/*
 * Read the file `filename` from start to end in 128 KiB chunks and discard
 * the data; the function exists only to generate read load.
 *
 * Directories are skipped silently.  A failing stat/open/pread/close is
 * reported with perror(); a pread failure ends the read early.
 */
void readfile(char *filename)
{
	char buf[128*1024];
	struct stat st;
	ssize_t bytes_read;
	off_t offset = 0;	/* off_t, so large files work where ssize_t is narrower */
	int fd;

	if (stat(filename, &st) == -1) {
		perror("stat");
		return;
	}

	/*
	 * The file type is a multi-bit field: testing the S_IFDIR bit alone
	 * also matches block devices, so use S_ISDIR() to compare the type.
	 */
	if (S_ISDIR(st.st_mode))
		return;

	fd = open(filename, O_RDONLY);
	if (fd == -1) {
		perror("open");
		return;
	}

	/* pread() returns 0 at end of file. */
	while ((bytes_read = pread(fd, buf, sizeof(buf), offset)) != 0) {
		if (bytes_read == -1) {
			perror("pread");
			break;
		}
		offset += bytes_read;
	}

	if (close(fd) == -1) {
		perror("close");
	}
}

/*
 * Fill the first `size` bytes of `buffer` with pseudo-random values.
 * The generator is re-seeded from gethrtime() on every call.
 */
void populate_buffer(char *buffer, int size)
{
	int filled = 0;

	srandom(gethrtime());
	while (filled < size) {
		buffer[filled] = random();
		filled++;
	}
}

/*
 * Overwrite the existing file `filename` in place with pseudo-random data,
 * 128 KiB at a time, keeping its current size.
 *
 * Directories are skipped silently.  A failing stat/open/pwrite/close is
 * reported with perror(); a pwrite failure stops the rewrite.
 */
void writefile(char *filename)
{
	char buf[128*1024];
	const off_t bufmax = sizeof(buf);
	struct stat st;
	off_t filesize;
	off_t offset = 0;
	int fd;

	if (stat(filename, &st) == -1) {
		perror("stat");
		return;
	}

	/* Compare the whole file-type field; the S_IFDIR bit is shared with
	 * other file types such as block devices. */
	if (S_ISDIR(st.st_mode))
		return;

	filesize = st.st_size;

	fd = open(filename, O_RDWR);
	if (fd == -1) {
		perror("open");
		return;
	}

	while (offset < filesize) {
		off_t remaining = filesize - offset;
		size_t chunk = (size_t)(remaining < bufmax ? remaining : bufmax);
		ssize_t written;

		populate_buffer(buf, (int)chunk);
		written = pwrite(fd, buf, chunk, offset);
		if (written == -1) {
			/*
			 * Stop here: adding -1 to the offset would move the
			 * position backwards and the loop would never end.
			 */
			perror("pwrite");
			break;
		}
		if (written == 0)
			break;	/* no progress possible; avoid spinning */
		offset += written;
	}

	if (close(fd) == -1) {
		perror("close");
	}
}

/*
 * Create a uniquely named temporary file inside `pathname`, write a short
 * marker string into it, and remove it again.  The create/delete pair is what
 * produces events on a watched directory.
 *
 * `fd` is accepted for symmetry with the caller but is not used.
 * Failures are reported on stderr; the function never aborts the process.
 */
static void volatile_file(char *pathname, int fd)
{
	const char	*fileprefix = "$matlab_temp_file$";
	char	fullname[PATH_MAX+64];
	FILE	*fp;
	int	len;

	(void)fd;

	/* Bounded formatting: `pathname` comes from the command line. */
	len = snprintf(fullname, sizeof(fullname), "%s/$matlab_temp_file$_%d_%llu",
	    pathname, (int)getpid(), (unsigned long long)gethrtime());
	if (len < 0 || (size_t)len >= sizeof(fullname)) {
		fprintf(stderr, "ERROR: temporary file name too long under \"%s\"\n", pathname);
		return;
	}

	fp = fopen(fullname, "w");
	if (fp == NULL) {
		fprintf(stderr, "ERROR: unable to create file \"%s\" [%d] [%s]\n", fullname, errno, strerror(errno));
		return;
	}

	fwrite(fileprefix, sizeof(char), strlen(fileprefix), fp);
	if (ferror(fp)) {
		fprintf(stderr, "ERROR: failure writing to \"%s\"\n", fullname);
	}
	/* fclose() flushes, so a write error may only surface here. */
	if (fclose(fp) != 0) {
		fprintf(stderr, "ERROR: failure closing \"%s\" [%d] [%s]\n", fullname, errno, strerror(errno));
	}

	if (remove(fullname) != 0) {
		fprintf(stderr, "ERROR: unable to remove file \"%s\" [%d] [%s]\n", fullname, errno, strerror(errno));
	}
}

/*
 * Walk every non-hidden entry of directory `path`, then create and delete a
 * temporary file in it via volatile_file().
 *
 * `fd` is passed through to volatile_file() unchanged.
 * A failing opendir()/readdir() is reported with perror().
 */
static void scanfolder(char *path, int fd)
{
	DIR *dirp;
	struct dirent *dp;
	char filepath[PATH_MAX];

	if ((dirp = opendir(path)) == NULL) {
		perror("opendir");
		return;
	}

	/*
	 * readdir() is used instead of readdir_r(): the stream is private to
	 * this function, and no caller-supplied entry buffer is needed.  (The
	 * previous alloca()'d entry buffer was passed to free(), which is
	 * undefined behaviour.)
	 */
	for (;;) {
		errno = 0;	/* lets us tell end-of-directory from an error */
		dp = readdir(dirp);
		if (dp == NULL) {
			if (errno != 0)
				perror("readdir");
			break;
		}
		if (dp->d_name[0] == '.')
			continue;

		/*
		 * Full path of the entry.  Per-file load is currently disabled;
		 * readfile(filepath) / writefile(filepath) can be called here.
		 */
		snprintf(filepath, sizeof(filepath), "%s/%s", path, dp->d_name);
	}

	closedir(dirp);

	volatile_file(path, fd);
}

/*
 * Body of one stress child.  Opens an inotify instance and then, 100 times:
 * adds a watch on `pathname`, scans the folder (which also creates and
 * removes a temporary file), polls the inotify descriptor for up to 100 ms,
 * reads whatever is pending, and removes the watch again.
 */
void proc_to_run(char *pathname)
{
	struct pollfd pfd[1];
	char events[4096];
	int ifd;
	int watch;
	int ready;
	int iter;

	ifd = inotify_init();
	if (ifd < 0) {
		perror( "inotify_init" );
		return;
	}

	for (iter = 0; iter < 100; iter++) {
		/* The flags below add up to 0x210003c4. */
		watch = inotify_add_watch(ifd, pathname,
		    IN_MASK_ADD | IN_ONLYDIR | IN_CREATE | IN_DELETE |
		    IN_MOVED_FROM | IN_MOVED_TO | IN_ATTRIB);
		if (watch == -1) {
			perror("inotify_add_watch");
			break;
		}

		pfd[0].fd = ifd;
		pfd[0].events = POLLIN;

		scanfolder(pathname, ifd);

		/* Drain pending events, if any arrived within 100 ms. */
		ready = poll(pfd, 1, 100);
		if (ready > 0) {
			read(ifd, events, 4096);
		}

		inotify_rm_watch(ifd, watch);
	}

	close(ifd);
}

/*
 * SIGCHLD handler: reap every child that has already exited, without
 * blocking.
 *
 * The final waitpid() call fails (typically with ECHILD), so errno is saved
 * on entry and restored on exit; otherwise the handler would overwrite the
 * errno of whatever code it interrupted.
 */
static void sig_cld(int signum)
{
        int saved_errno = errno;

        (void)signum;
        while (waitpid((pid_t)-1,(int *)NULL, WNOHANG) > 0)
        {
        }
        errno = saved_errno;
}

/*
 * Install `handler` for signal `signum` using sigaction().
 *
 * Returns the handler that was installed before the call, or SIG_ERR if
 * sigaction() fails (for example for an invalid signal number).
 *
 * While the handler runs, `signum` itself is blocked.  Where SA_RESTART is
 * available it is set for every signal except SIGALRM.
 */
void (*CatchSignal(int signum,void (*handler)(int )))(int)
{
        struct sigaction act;
        struct sigaction oldact;

        ZERO_STRUCT(act);
        ZERO_STRUCT(oldact);

        act.sa_handler = handler;
#ifdef SA_RESTART
        /*
         * We *want* SIGALRM to interrupt a system call.
         */
        if(signum != SIGALRM)
                act.sa_flags = SA_RESTART;
#endif
        sigemptyset(&act.sa_mask);
        sigaddset(&act.sa_mask,signum);

        /* On failure oldact is not filled in, so do not return it. */
        if (sigaction(signum,&act,&oldact) == -1)
                return SIG_ERR;

        return oldact.sa_handler;
}

/* Register sig_cld() as the SIGCHLD handler; the previous handler is discarded. */
void CatchChild(void)
{
        (void) CatchSignal(SIGCHLD, sig_cld);
}

/*
 * Block until this process has no children left to wait for.
 *
 * errno is only meaningful after waitpid() has failed, so it is examined
 * only when -1 is returned; a stale ECHILD left by earlier code (for
 * instance a signal handler) can no longer end the loop early.  An
 * interrupted wait (EINTR) is retried.
 */
void wait_children(void) {
	pid_t pid;

	for (;;) {
		pid = waitpid(-1, NULL, 0);
		if (pid == -1 && errno != EINTR)
			break;	/* ECHILD: nothing left to wait for */
	}
}

/* Print the command-line synopsis to stdout, using argv[0] as the program name. */
void print_usage(char *argv[])
{
	const char *program = argv[0];

	fputs("Usage:\n\n", stdout);
	printf("\t%s -f folder -n processes\n", program);
	fputs("\n", stdout);
}

/*
 * inotify stress driver.
 *
 * Options:
 *   -f folder     directory to watch and scan (required)
 *   -n processes  number of children per round (default 100, must be >= 0)
 *
 * Runs forever: each round forks the requested number of children, each of
 * which runs proc_to_run() on the folder, then sleeps 5 seconds and waits for
 * the remaining children before starting the next round.
 */
int main(int argc, char *argv[])
{
	int c;
	char *foldername = NULL;
	char *end;
	long val;
	int n_children = 100;
	pid_t pid;
	int n_processes;

	while ((c = getopt(argc, argv, ":f:n:")) != -1) {
		switch(c) {
		case 'f':
			foldername = optarg;
			break;
		case 'n':
			/*
			 * Validate the count: a negative value would make the
			 * fork loop below run practically without bound.
			 */
			errno = 0;
			val = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    val < 0 || val > INT_MAX) {
				printf("-n needs a non-negative integer, got \"%s\"\n", optarg);
				exit(1);
			}
			n_children = (int)val;
			break;
		case ':':
			printf("-%c without arg\n", optopt);
			exit(1);
			break;
		default:
			/* getopt() returns '?' for an option it does not know. */
			printf("unknown option -%c\n\n", optopt);
			print_usage(argv);
			exit(1);
			break;
		}
	}
	if(foldername == NULL) {
		printf("Must specify a folder.\n\n");
		print_usage(argv);
		exit(1);
	}

	printf("(f) folder = %s\n", foldername);
	printf("(n) number of processes = %d\n", n_children);
	n_processes = n_children;

	CatchChild();

	while(1) {
		for (n_children = n_processes; n_children > 0; n_children--) {
			pid = fork();
			if(pid == -1) {
				perror("fork: ");
				continue;
			}
			if(pid == 0) {
				/* child */
				proc_to_run(foldername);
				exit(0);
			}
			/* parent: keep forking */
		}
		sleep(5);
		wait_children();
	}

	return 0;
}

/* 
gcc -D_POSIX_PTHREAD_SEMANTICS -o inotify_stress_new inotify_stress_new.c 
*/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>
#ifdef __sun__
#include <sys/filio.h>
#endif

#include <unistd.h>
#include <stropts.h>
#include <poll.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#include <dirent.h>
#include <strings.h>
#include <alloca.h>

#include <limits.h>
#include <stddef.h>
#include <sys/stat.h>

#include <malloc.h> 

/* Default write buffer size in bytes (128 KiB); parenthesised so it is safe inside larger expressions. */
#define DEFAULT_BUFFER_SIZE (128 * 1024)

/*
 * Write `size` pseudo-random bytes to `buffer`, one byte per random() call.
 * The generator is seeded from gethrtime() each time the function runs.
 */
void populate_buffer(char *buffer, int size)
{
	char *out = buffer;
	int remaining;

	srandom(gethrtime());
	for (remaining = size; remaining > 0; remaining--) {
		*out++ = random();
	}
}

/*
 * Create directory `dir`, requesting read/write/execute permission for user,
 * group and others.  Returns mkdir()'s result: 0 on success, non-zero on
 * failure, in which case the error is also logged to stderr.
 */
int makeDir(char *dir)
{
	int rc = mkdir(dir, S_IRWXU | S_IRWXG | S_IRWXO);

	if (rc != 0)
		fprintf(stderr, "ERROR: unable to create folder \"%s\" [%d] [%s]\n", dir, errno, strerror(errno));

	return rc;
}

/*
 * Create (or open) `filename` and write `fileLen` bytes of pseudo-random data
 * to it from the start, `bufferSize` bytes at a time.
 *
 * A non-positive `bufferSize`, an allocation failure, or a failing
 * open/pwrite/close is reported on stderr; the function then returns without
 * writing (more) data.
 */
void writefile(char *filename, unsigned long fileLen, int bufferSize)
{
	char *buf;
	off_t filesize = fileLen;
	off_t offset = 0;
	int fd;

	/* A zero-length chunk would never make progress in the loop below. */
	if (bufferSize <= 0) {
		fprintf(stderr, "ERROR: invalid buffer size %d - pid %d\n", bufferSize, (int)getpid());
		return;
	}

	buf = malloc((size_t)bufferSize);
	if (buf == NULL) {
		fprintf(stderr, "ERROR: out of memory - pid %d\n", (int)getpid());
		return;
	}

	/*
	 * With O_CREAT the mode argument is mandatory; without it the new
	 * file gets whatever permission bits happen to be on the stack.
	 */
	fd = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
	if (fd == -1) {
		perror("open");
		free(buf);
		return;
	}

	while (offset < filesize) {
		off_t remaining = filesize - offset;
		size_t chunk = remaining < (off_t)bufferSize ? (size_t)remaining : (size_t)bufferSize;
		ssize_t written;

		populate_buffer(buf, (int)chunk);
		written = pwrite(fd, buf, chunk, offset);
		if (written == -1) {
			perror("pwrite");
			break;
		}
		if (written == 0)
			break;	/* no progress possible; avoid spinning */
		offset += written;
	}

	if (close(fd) == -1) {
		perror("close");
	}
	free(buf);
}

/*
 * Create `filename` with stdio and write `fileLen` bytes of pseudo-random
 * data to it, using a stream buffer of `bufferSize` bytes and writing
 * `bufferSize` bytes per fwrite() call.
 *
 * Returns 0 for success, 1 for failure (invalid buffer size, fopen, allocation,
 * setvbuf, fwrite or fclose failure); failures are logged to stderr.
 */
int WriteToFile(char* filename, unsigned long fileLen, int bufferSize)
{
	FILE *fp = NULL;
	char *iobuf = NULL;	/* stream buffer handed to setvbuf() */
	char *data = NULL;	/* payload passed to fwrite() */
	unsigned long bytesWrittenTotal = 0;
	int rc = 1;

	if (bufferSize <= 0) {
		fprintf(stderr, "ERROR: invalid buffer size %d \"%s\"\n", bufferSize, filename);
		return 1;
	}

	fp = fopen(filename, "wb");
	if (fp == NULL) {
		fprintf(stderr, "ERROR: unable to create file \"%s\" [%d] [%s]\n", filename, errno, strerror(errno));
		return 1;
	}

	/*
	 * Two separate buffers: the contents of an array given to setvbuf()
	 * are indeterminate while the stream is open, so it must not double
	 * as the source of the data being written.
	 */
	iobuf = malloc((size_t)bufferSize);
	data = malloc((size_t)bufferSize);
	if (iobuf == NULL || data == NULL) {
		fprintf(stderr, "ERROR: unable to allocate memory(buffer size %d) \"%s\" [%d] [%s]\n", bufferSize, filename, errno, strerror(errno));
		goto out;
	}

	if (setvbuf(fp, iobuf, _IOFBF, (size_t)bufferSize) != 0) {
		fprintf(stderr, "ERROR: unable to setvbuf(buffer size %d) \"%s\" [%d] [%s]\n", bufferSize, filename, errno, strerror(errno));
		goto out;
	}

	while (bytesWrittenTotal < fileLen) {
		unsigned long remaining = fileLen - bytesWrittenTotal;
		size_t bytesToWrite = remaining < (unsigned long)bufferSize ? (size_t)remaining : (size_t)bufferSize;
		size_t bytesWritten;

		populate_buffer(data, (int)bytesToWrite);
		bytesWritten = fwrite(data, 1, bytesToWrite, fp);
		bytesWrittenTotal += bytesWritten;
		if (bytesWritten != bytesToWrite) {
			fprintf(stderr, "ERROR: fwrite failure \"%s\" [%d] [%s]\n", filename, errno, strerror(errno));
			goto out;
		}
	}
	rc = 0;

out:
	/* fclose() flushes buffered data, so it can still report a write error. */
	if (fclose(fp) != 0) {
		if (rc == 0)
			fprintf(stderr, "ERROR: fclose failure \"%s\" [%d] [%s]\n", filename, errno, strerror(errno));
		rc = 1;
	}
	/* The stream buffer may only be released once the stream is closed. */
	free(iobuf);
	free(data);

	return rc;
}

/*
 * Body of one writer child.  Creates a private subfolder
 * "<pathname>/<pid>_<hrtime>" and then writes files of `filesize` bytes into
 * it, one after another, using `buffersize`-byte writes.
 *
 * Returns only if the subfolder cannot be created or a path does not fit the
 * name buffers; otherwise it loops until the process is killed.
 */
void proc_to_run(char *pathname, unsigned long filesize, int buffersize)
{
	char subdir[4096];
	char filename[4096];
	int len;

	/* Bounded formatting: `pathname` comes from the command line. */
	len = snprintf(subdir, sizeof(subdir), "%s/%d_%llu", pathname,
	    (int)getpid(), (unsigned long long)gethrtime());
	if (len < 0 || (size_t)len >= sizeof(subdir)) {
		fprintf(stderr, "ERROR: folder name too long under \"%s\"\n", pathname);
		return;
	}

	if(makeDir(subdir)) return;

	while(1) {
		len = snprintf(filename, sizeof(filename), "%s/%llu", subdir,
		    (unsigned long long)gethrtime());
		if (len < 0 || (size_t)len >= sizeof(filename)) {
			fprintf(stderr, "ERROR: file name too long under \"%s\"\n", subdir);
			return;
		}
		writefile(filename, filesize, buffersize);
	}
}

/*
 * Block until this process has no children left to wait for.
 *
 * errno is only meaningful after waitpid() has failed, so it is examined
 * only when -1 is returned; a stale ECHILD left by earlier code can no
 * longer end the loop while children are still running.  An interrupted
 * wait (EINTR) is retried.
 */
void wait_children(void) {
	pid_t pid;

	for (;;) {
		pid = waitpid(-1, NULL, 0);
		if (pid == -1 && errno != EINTR)
			break;	/* ECHILD: nothing left to wait for */
	}
}

/* Print the command-line synopsis to stdout, using argv[0] as the program name. */
void print_usage(char *argv[])
{
	const char *program = argv[0];

	fputs("Usage:\n\n", stdout);
	printf("\t%s -f folder -n processes -s filesize -b buffersize\n", program);
	fputs("\n", stdout);
}

/*
 * Parse `text` as a base-10 integer within [min, max].
 * Returns 0 and stores the value in *out, or -1 if `text` is not a number,
 * has trailing characters, or is out of range.
 */
static int parse_long_option(const char *text, long min, long max, long *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE || val < min || val > max)
		return -1;
	*out = val;
	return 0;
}

/*
 * Bulk file writer.
 *
 * Options:
 *   -f folder      directory to write into (required)
 *   -n processes   number of writer children (default 100, must be >= 0)
 *   -s filesize    size of each file in bytes (default 1 GiB)
 *   -b buffersize  bytes per write call (default DEFAULT_BUFFER_SIZE, must be > 0)
 *
 * Forks the writers, each running proc_to_run(), then waits for them.
 */
int main(int argc, char *argv[])
{
	int c;
	char *foldername = NULL;
	char *end;
	long val;
	int n_children = 100;
	pid_t pid;
	unsigned long filesize = 1024UL * 1024 * 1024;
	int buffersize = DEFAULT_BUFFER_SIZE;

	while ((c = getopt(argc, argv, ":f:n:s:b:")) != -1) {
		switch(c) {
		case 'b':
			/* A non-positive buffer size would stall or break the writers. */
			if (parse_long_option(optarg, 1, INT_MAX, &val) != 0) {
				printf("-b needs a positive integer, got \"%s\"\n", optarg);
				exit(1);
			}
			buffersize = (int)val;
			break;
		case 's':
			/* strtoul() silently negates "-N", so reject a sign explicitly. */
			errno = 0;
			filesize = strtoul(optarg, &end, 10);
			if (optarg[0] == '-' || end == optarg || *end != '\0' || errno == ERANGE) {
				printf("-s needs a non-negative integer, got \"%s\"\n", optarg);
				exit(1);
			}
			break;
		case 'f':
			foldername = optarg;
			break;
		case 'n':
			/* A negative count would make the fork loop run practically without bound. */
			if (parse_long_option(optarg, 0, INT_MAX, &val) != 0) {
				printf("-n needs a non-negative integer, got \"%s\"\n", optarg);
				exit(1);
			}
			n_children = (int)val;
			break;
		case ':':
			printf("-%c without arg\n", optopt);
			exit(1);
			break;
		default:
			/* getopt() returns '?' for an option it does not know. */
			printf("unknown option -%c\n\n", optopt);
			print_usage(argv);
			exit(1);
			break;
		}
	}
	if(foldername == NULL) {
		printf("Must specify a folder.\n\n");
		print_usage(argv);
		exit(1);
	}

	printf("(f) folder = %s\n", foldername);
	printf("(n) number of processes = %d\n", n_children);
	printf("(s) file size = %lu\n", filesize);
	printf("(b) buffer size = %d\n", buffersize);

	for (; n_children > 0; n_children--) {
		pid = fork();
		if(pid == -1) {
			perror("fork: ");
			continue;
		}
		if(pid == 0) {
			/* child */
			proc_to_run(foldername, filesize, buffersize);
			exit(0);
		}
		/* parent: keep forking */
	}

	sleep(5);

	wait_children();

	return 0;
}
/* 
gcc -D_POSIX_PTHREAD_SEMANTICS -o write-files write-files.c 
*/

Reply via email to