When Tux3 snapshots arrive we will need some means of creating, deleting,
listing and otherwise managing them. This could get quite involved,
particularly when per-directory snapshots are supported. There is no existing
interface in Linux suitable for this purpose. Today I will present an
interface that I think can do the job well, and be useful in a number of other
ways, particularly for monitoring and debugging.
Here is my original ddlink writeup, posted a year ago:
"An alternative interface to device mapper"
http://lwn.net/Articles/271805/
Ddlink is a generic pipe-like interface originally intended for controlling
device drivers, but useful for many other kinds of kernel/userspace
interaction. Interfaces may range from very simple, implemented in a few dozen
lines of kernel and userspace code, to complex state machines such as the
device mapper control interface given as an example in my earlier post.
Ddlink was inspired by Trond Myklebust's venerable and successful rpc_pipefs
interface, currently used to control NFS clients and servers. Ddlink provides
an application program with an fd object that can be read, written, ioctled and
polled much like a pipe, suitable for efficient binary communication with
kernel components. Unlike a pipe, there is no write buffering. Each write to
a ddlink directly triggers some kernel handler. Reads are buffered via an
output queue of ddlink "items", each of which is an unrestricted blob. In
practice, a ddlink item is usually a C structure or ascii text. Ioctls on
ddlinks are unrestricted and the ioctl command space is unpolluted.
There are no partial reads of ddlink output data. A read call either provides
enough space to hold the next outbound kernel item or triggers EIO, meaning
"make your buffer bigger and try again". This arrangement takes the onus off
the userspace program to buffer partial reads in order to reassemble input that
would otherwise be brutally dismembered. As a bonus, the kernel code for
ddlink is considerably simplified versus Trond's rpc-pipefs precursor.
Unlike a pipe, there is no waiting for input on a ddlink: if there is
nothing to read then the read returns immediately with zero length. If
some other behavior is desired then it can be obtained using poll.
Ddlink provides a simple framework to the implementor for generalized
allocation and destruction of dditems. There is a small library of helper
functions that are useful for creating domain-specific ddlink interfaces.
The code for ddlink is compact:
* ~150 lines of core ddlink code
* ~100 lines of support for kernel ddlink implementations
* A ddlink kernel implementation example in 50 lines
In terms of object size:
* ~1800 bytes of kernel code for ddlink and library
* ~325 bytes of module code for example implementation
So ddlink is about as light and tight as an interface can be. It is also
highly efficient, flexible and extensible, and requires very little boilerplate
code, either in kernel or user space.
Ddlink has a number of advantages over ioctl:
- Input and output transfer size are part of the interface
- Delivers error messages as readable text
- Provides a mechanism for queueing asynchronous results (also
provides a mechanism for returning immediate results)
- Supports stateful interface protocols
- The creator of a ddlink fd owns it and does not have to worry
about traffic on it from other sources.
- Supports a file-oriented security model
- Pollable
Two example ddlink userspace programs are attached, based on the example
kernel ddlink implementation in the patch.
- The fs/tux3 kernel implementation is a simple echo, cut and
pasted from the ddlink.c example, where all text written to the
ddlink is just requeued for reading.
- Ioctl any file or directory on the mounted filesystem with 0xdd
to obtain a ddlink
- Example "ddtest.c" reads arguments from the command line, writes
to the ddlink, reads from the ddlink and shows the output.
- Example "ddtest.c" shows ddlink used for interprocess communication.
Forks a child process; the child writes several items to the ddlink
and exits, while the parent polls the ddlink, reads some of them and exits.
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
/*
 * ddtest: obtain a ddlink from the filesystem mounted on /mnt, then write
 * each command line argument to it and print whatever is read back (the
 * example kernel side simply requeues written text for reading).
 *
 * Returns 0 when every argument made the round trip, 1 on the first failure.
 */
int main(int argc, char *argv[])
{
	int fd, dd, i, status = 0;

	if ((fd = open("/mnt", O_RDONLY)) == -1) {
		perror("open /mnt");
		return 1;
	}
	dd = ioctl(fd, _IO(0, 0xdd));
	if (dd == -1)
		perror("ioctl 0xdd"); /* report before close() can disturb errno */
	close(fd); /* the ddlink fd is independent of the file it was requested on */
	if (dd == -1)
		return 1;

	for (i = 1; i < argc; i++) {
		char buf[100];
		ssize_t len;

		if (write(dd, argv[i], strlen(argv[i])) == -1) {
			perror("write");
			status = 1;
			break;
		}
		len = read(dd, buf, sizeof(buf));
		if (len == -1) {
			/*
			 * Must not reach printf: a negative precision is treated as
			 * "no precision" and buf would be read as an unterminated string.
			 */
			perror("read");
			status = 1;
			break;
		}
		printf("dd text = '%.*s'\n", (int)len, buf);
	}
	close(dd);
	return status;
}
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
/*
 * Check the result of a system call style function.
 *
 * Any result other than -1 is accepted and the function simply returns.
 * On -1, prints the message together with the current errno value and its
 * strerror() text to stdout, then terminates the process with status 1.
 */
void ok(int result, char *message)
{
	int saved;

	if (result != -1)
		return;
	saved = errno; /* capture once so number and text always agree */
	printf("%s error %i: %s\n", message, saved, strerror(saved));
	exit(1);
}
/*
 * ddlink interprocess example: argv[1] names a file or directory on the
 * mounted filesystem; a ddlink is requested on it with ioctl 0xdd and then
 * shared across fork().  The child writes seven items and exits; the parent
 * polls for input and prints the first five items it reads.
 *
 * Every system call is checked through ok(), which reports the failure and
 * exits with status 1.
 */
int main(int argc, char *argv[])
{
	int fd, dd, i;
	pid_t child;

	ok(-(argc < 2), "need a filename");
	ok(fd = open(argv[1], O_RDONLY), "open");
	ok(dd = ioctl(fd, _IO(0, 0xdd)), "ioctl");
	close(fd); /* only the ddlink fd is needed from here on */

	ok(child = fork(), "fork");
	if (child == 0) {
		/* Child: the first two items carry their terminating NUL, the rest do not. */
		ok(write(dd, "hello", 6), "write");
		ok(write(dd, "world", 6), "write");
		ok(write(dd, "more1", 5), "write");
		ok(write(dd, "more2", 5), "write");
		ok(write(dd, "more3", 5), "write");
		ok(write(dd, "more4", 5), "write");
		ok(write(dd, "more5", 5), "write");
		return 0;
	}

	/* Parent: block in poll until an item is queued, then read exactly one item. */
	for (i = 0; i < 5; i++) {
		struct pollfd pfd = { .fd = dd, .events = POLLIN };
		char text[100];
		int len;

		ok(poll(&pfd, 1, -1), "poll");
		/* A failed read must not reach printf: a negative precision would
		 * make it read the uninitialized buffer as a string. */
		ok(len = read(dd, text, sizeof(text)), "read");
		printf("dd text %i = '%.*s'\n", len, len, text);
	}
	return 0;
}
diff -r e034620d446a user/kernel/Makefile
--- a/user/kernel/Makefile Tue Feb 24 06:40:53 2009 +0900
+++ b/user/kernel/Makefile Wed Feb 25 20:17:23 2009 -0800
@@ -7,7 +7,7 @@ clean:
make -C $(LINUX) M=`pwd` CONFIG_TUX3=m clean
else
obj-$(CONFIG_TUX3) += tux3.o
-tux3-objs += balloc.o btree.o dir.o dleaf.o filemap.o iattr.o \
+tux3-objs += ddlink.o balloc.o btree.o dir.o dleaf.o filemap.o iattr.o \
ileaf.o namei.o inode.o super.o xattr.o log.o commit.o utility.o
EXTRA_CFLAGS += -Werror -std=gnu99 -Wno-declaration-after-statement
endif
diff -r e034620d446a user/kernel/ddlink.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/user/kernel/ddlink.c Wed Feb 25 20:17:23 2009 -0800
@@ -0,0 +1,305 @@
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include "ddlink.h"
+
+/*
+ * ddlink: Device Driver link. A userspace to kernel device control interface
+ * useful for kernel/userspace interfaces.
+ *
+ * (c) 2007-2009, Daniel Phillips <[email protected]>
+ */
+
+// To do:
+// * need spinlocks on push, pop, queue, ready and ???
+
+int ddlink_ready(struct ddinode *dd)
+{
+ return !list_empty(&dd->list);
+}
+EXPORT_SYMBOL_GPL(ddlink_ready);
+
+struct dditem *ddlink_pop(struct ddinode *dd)
+{
+ struct dditem *item;
+ BUG_ON(!ddlink_ready(dd));
+ item = list_entry(dd->list.next, struct dditem, link);
+ list_del(&item->link);
+ return item;
+}
+EXPORT_SYMBOL_GPL(ddlink_pop);
+
+void ddlink_clear(struct ddinode *dd)
+{
+ while (ddlink_ready(dd))
+ dd->destroy_item(ddlink_pop(dd));
+}
+EXPORT_SYMBOL_GPL(ddlink_clear);
+
+static struct inode *ddlink_alloc_inode(struct super_block *sb)
+{
+ struct ddinode *dd = kmalloc(sizeof(struct ddinode), GFP_KERNEL);
+ if (!dd)
+ return NULL;
+ *dd = (typeof(*dd)){
+ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(dd->wait),
+ .list = LIST_HEAD_INIT(dd->list) };
+ inode_init_once(&dd->inode);
+ return &dd->inode;
+}
+
+static void ddlink_free_inode(struct inode *inode)
+{
+ struct ddinode *dd = ddinode(inode);
+ ddlink_clear(dd);
+ if (ddinfo(inode))
+ dd->destroy_info(ddinfo(inode));
+ kfree(ddinode(inode));
+}
+
+static struct super_operations ddlink_sops = {
+ .alloc_inode = ddlink_alloc_inode,
+ .destroy_inode = ddlink_free_inode
+};
+
+static struct vfsmount *ddlink_mnt;
+
+static int ddlink_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ return get_sb_pseudo(fs_type, "ddlink", &ddlink_sops, 0x20070318, mnt);
+}
+
+static struct file_system_type ddlink_type = {
+ .name = "ddlink",
+ .get_sb = ddlink_get_sb,
+ .kill_sb = kill_anon_super };
+
+static int ddlink_delete_dentry(struct dentry *dentry) { return 1; } // vfs bogon
+static struct dentry_operations ddlink_dops = { .d_delete = ddlink_delete_dentry };
+
+int ddlink(struct file_operations *fops, void *(*create)(struct ddinode *dd, void *info), void *info) /* returns a new ddlink fd, or a negative errno */
+{
+	int err = -ENFILE, fd;
+	struct file *file = get_empty_filp();
+	struct inode *inode;
+	struct dentry *dentry;
+	void *private = NULL;
+
+	if (!file)
+		goto no_file;
+	inode = new_inode(ddlink_mnt->mnt_sb);
+	if (!inode)
+		goto no_inode;
+	ddinode(inode)->destroy_info = kfree; /* defaults; create() may override them */
+	ddinode(inode)->create_item = kmalloc;
+	ddinode(inode)->destroy_item = kfree;
+	if (create && IS_ERR(private = create(ddinode(inode), info))) {
+		err = PTR_ERR(private);
+		goto no_dentry; /* the inode exists by now: it must be iput, not leaked */
+	}
+	inode->i_private = private;
+
+	dentry = d_alloc(ddlink_mnt->mnt_sb->s_root, &(struct qstr){ });
+	if (!dentry)
+		goto no_dentry;
+	if ((fd = err = get_unused_fd()) < 0)
+		goto no_fd;
+	/* Mark inode dirty so it will not be moved to the dirty list */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IFCHR|S_IRWXUGO;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_fop = fops;
+	dentry->d_op = &ddlink_dops;
+	d_instantiate(dentry, inode); // d_add?
+
+	file->f_dentry = dentry;
+	file->f_vfsmnt = mntget(ddlink_mnt);
+	file->f_mapping = inode->i_mapping;
+	file->f_mode = FMODE_READ|FMODE_WRITE;
+//	file->f_flags = O_RDWR; // not used yet
+	file->f_op = inode->i_fop;
+	fd_install(fd, file);
+	return fd;
+no_fd:
+	dput(dentry); /* dentry was never instantiated, so the inode is dropped separately below */
+no_dentry:
+	iput(inode);
+no_inode:
+	put_filp(file);
+no_file:
+	return err;
+}
+EXPORT_SYMBOL_GPL(ddlink);
+
+static int __init ddlink_init(void)
+{
+ struct vfsmount *mnt;
+ register_filesystem(&ddlink_type);
+ mnt = kern_mount(&ddlink_type);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ ddlink_mnt = mnt;
+ return 0;
+}
+
+static void __exit ddlink_exit(void)
+{
+ mntput(ddlink_mnt);
+ unregister_filesystem(&ddlink_type);
+}
+
+module_init(ddlink_init);
+module_exit(ddlink_exit);
+
+/* library methods */
+
+void ddlink_push(struct ddinode *dd, struct dditem *item)
+{
+ list_add(&item->link, &dd->list);
+ wake_up_interruptible(&dd->wait);
+}
+EXPORT_SYMBOL_GPL(ddlink_push);
+
+void ddlink_queue(struct ddinode *dd, struct dditem *item)
+{
+ list_add_tail(&item->link, &dd->list);
+ wake_up_interruptible(&dd->wait);
+}
+EXPORT_SYMBOL_GPL(ddlink_queue);
+
+struct dditem *dditem_new(struct ddinode *dd, size_t size) /* new item with size payload bytes, or NULL */
+{
+	struct dditem *item;
+	if (!(item = dd->create_item(sizeof(*item) + size, GFP_KERNEL))) /* bare __GFP_NOFAIL is not a valid mode; callers check for NULL */
+		return NULL;
+	*item = (typeof(*item)){ .size = size }; /* clears link and type, records the payload size */
+	return item;
+}
+EXPORT_SYMBOL_GPL(dditem_new);
+
+struct dditem *dditem_in(struct ddinode *dd, const void *buf, size_t len, int z)
+{
+ struct dditem *item = dditem_new(dd, len + z);
+
+ if (!item)
+ return ERR_PTR(-ENOMEM);
+ if (copy_from_user(item->data, buf, len)) {
+ dd->destroy_item(item);
+ return ERR_PTR(-EFAULT);
+ }
+ if (z) {
+ item->size -= z;
+ item->data[len] = 0;
+ }
+ return item;
+}
+EXPORT_SYMBOL_GPL(dditem_in);
+
+int dditem_out(struct ddinode *dd, void *buf, size_t len, struct dditem *item) // !!! not tested
+{
+ if (len < item->size)
+ return -EINVAL;
+ if (copy_to_user(buf, item->data, len = item->size))
+ return -EFAULT;
+ dd->destroy_item(item);
+ return len;
+}
+EXPORT_SYMBOL_GPL(dditem_out);
+
+int ddlink_post(struct ddinode *dd, void *data, unsigned len)
+{
+ struct dditem *item = dditem_new(dd, len);
+ if (!item)
+ return -ENOMEM;
+ memcpy(item->data, data, len);
+ ddlink_push(dd, item);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ddlink_post);
+
+int ddlink_error(struct ddinode *dd, int err, const char *fmt, ...) /* queue formatted text for a failure; returns err, or -ENOMEM */
+{
+	int size = 200; /* scratch capacity; longer messages are truncated */
+	struct dditem *item, *work;
+	va_list args;
+	if (err >= 0) /* not an error: pass the result straight through */
+		return err;
+	if (!(work = dditem_new(dd, size)))
+		return -ENOMEM;
+	va_start(args, fmt);
+	size = vscnprintf(work->data, size, fmt, args); /* bytes actually stored; vsnprintf's would-be length could overrun the memcpy below */
+	va_end(args);
+	if (!(item = dditem_new(dd, size))) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	memcpy(item->data, work->data, size); /* right-sized copy without the terminating NUL */
+	ddlink_push(dd, item); /* pushed at the head so the error is read before queued items */
+fail:
+	dd->destroy_item(work);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ddlink_error);
+
+unsigned ddlink_poll(struct file *file, poll_table *table)
+{
+ struct ddinode *dd = ddinode(file->f_dentry->d_inode);
+ poll_wait(file, &dd->wait, table);
+ return ddlink_ready(dd) ? POLLIN : 0;
+}
+EXPORT_SYMBOL_GPL(ddlink_poll);
+
+#if 0
+/* ddlink example */
+
+#include "ddlink.h"
+
+static int ddlink_example_ioctl(struct inode *inode, struct file *file, unsigned cmd, unsigned long ptr)
+{
+ struct ddinode *dd = ddinode(inode);
+ printk("ddlink_ioctl %i (%lx) on %p\n", cmd, ptr, dd);
+ return 0;
+}
+
+static ssize_t ddlink_example_write(struct file *file, const char *buf, size_t len, loff_t *offset)
+{
+ struct ddinode *dd = ddinode(file->f_dentry->d_inode);
+ struct dditem *item = dditem_in(dd, buf, len, 1);
+ if (IS_ERR(item))
+ return PTR_ERR(item);
+ ddlink_queue(dd, item);
+ return len;
+}
+
+static ssize_t ddlink_example_read(struct file *file, char *buf, size_t len, loff_t *offset) /* hand one queued item to userspace */
+{
+	struct ddinode *dd = ddinode(file->f_dentry->d_inode);
+	if (list_empty(&dd->list))
+		return 0; /* nothing queued: zero-length read, never block; positive EFAULT would look like 14 bytes read */
+	return dditem_out(dd, buf, len, ddlink_pop(dd)); /* item size, or negative errno */
+}
+
+static struct file_operations ddlink_example_fops = {
+ .read = ddlink_example_read,
+ .write = ddlink_example_write,
+ .ioctl = ddlink_example_ioctl,
+ .poll = ddlink_poll,
+};
+
+void *ddlink_example_create(struct ddinode *dd, void *info)
+{
+ return NULL;
+}
+
+long example_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case DDLINK:
+ return ddlink(&ddlink_example_fops, ddlink_example_create, NULL);
+ }
+ return -ENOTTY;
+}
+#endif
diff -r e034620d446a user/kernel/ddlink.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/user/kernel/ddlink.h Wed Feb 25 20:17:23 2009 -0800
@@ -0,0 +1,44 @@
+#include <linux/poll.h>
+#include <linux/list.h>
+
+#define DDLINK _IO(0, 0xdd)
+
+struct dditem {
+ struct list_head link;
+ unsigned type; // not actually used
+ unsigned size;
+ unsigned char data[];
+};
+
+struct ddinode {
+ struct list_head list;
+ wait_queue_head_t wait;
+ typeof(kmalloc) *create_item;
+ typeof(kfree) *destroy_item;
+ typeof(kfree) *destroy_info;
+ struct inode inode;
+};
+
+static inline struct ddinode *ddinode(struct inode *inode)
+{
+ return container_of(inode, struct ddinode, inode);
+}
+
+static inline void *ddinfo(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+int ddlink(struct file_operations *fops, void *(*create)(struct ddinode *dd, void *info), void *info);
+void ddlink_queue(struct ddinode *dd, struct dditem *item);
+void ddlink_push(struct ddinode *dd, struct dditem *item);
+struct dditem *ddlink_pop(struct ddinode *dd);
+struct dditem *dditem_new(struct ddinode *dd, size_t size);
+void ddlink_clear(struct ddinode *dd);
+int ddlink_ready(struct ddinode *dd);
+
+struct dditem *dditem_in(struct ddinode *dd, const void *buf, size_t len, int z);
+int dditem_out(struct ddinode *dd, void *buf, size_t len, struct dditem *item);
+int ddlink_post(struct ddinode *dd, void *data, unsigned len);
+unsigned ddlink_poll(struct file *file, poll_table *table);
+int ddlink_error(struct ddinode *dd, int err, const char *fmt, ...);
diff -r e034620d446a user/kernel/inode.c
--- a/user/kernel/inode.c Tue Feb 24 06:40:53 2009 +0900
+++ b/user/kernel/inode.c Wed Feb 25 20:17:23 2009 -0800
@@ -373,13 +373,56 @@ int tux3_setattr(struct dentry *dentry,
return inode_setattr(inode, iattr);
}
+/* ddlink example */
+
+#include "ddlink.h"
+
+static int ddlink_example_ioctl(struct inode *inode, struct file *file, unsigned cmd, unsigned long ptr)
+{
+ struct ddinode *dd = ddinode(inode);
+ printk("ddlink_ioctl %i (%lx) on %p\n", cmd, ptr, dd);
+ return 0;
+}
+
+static ssize_t ddlink_example_write(struct file *file, const char *buf, size_t len, loff_t *offset)
+{
+ struct ddinode *dd = ddinode(file->f_dentry->d_inode);
+ struct dditem *item = dditem_in(dd, buf, len, 1);
+ if (IS_ERR(item))
+ return PTR_ERR(item);
+ ddlink_queue(dd, item);
+ return len;
+}
+
+static ssize_t ddlink_example_read(struct file *file, char *buf, size_t len, loff_t *offset) /* hand one queued item to userspace */
+{
+	struct ddinode *dd = ddinode(file->f_dentry->d_inode);
+	if (list_empty(&dd->list))
+		return 0; /* nothing queued: zero-length read, never block; positive EFAULT would look like 14 bytes read */
+	return dditem_out(dd, buf, len, ddlink_pop(dd)); /* item size, or negative errno */
+}
+
+static struct file_operations ddlink_example_fops = {
+ .read = ddlink_example_read,
+ .write = ddlink_example_write,
+ .ioctl = ddlink_example_ioctl,
+ .poll = ddlink_poll,
+};
+
+long tux3_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (cmd == DDLINK)
+ return ddlink(&ddlink_example_fops, NULL, NULL);
+ return -ENOTTY;
+}
+
static const struct file_operations tux_file_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
-// .unlocked_ioctl = fat_generic_ioctl,
+ .unlocked_ioctl = tux3_ioctl,
#ifdef CONFIG_COMPAT
// .compat_ioctl = fat_compat_dir_ioctl,
#endif
diff -r e034620d446a user/kernel/namei.c
--- a/user/kernel/namei.c Tue Feb 24 06:40:53 2009 +0900
+++ b/user/kernel/namei.c Wed Feb 25 20:17:23 2009 -0800
@@ -253,7 +253,10 @@ error:
return err;
}
+long tux3_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
const struct file_operations tux_dir_fops = {
+ .unlocked_ioctl = tux3_ioctl,
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = tux_readdir,
_______________________________________________
Tux3 mailing list
[email protected]
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3