Here is a better patch that does not deadlock (the previous one did) and that uses generic_sync_sb_inodes, which is the more appropriate interface, instead of freeze_bdev.
To test this, I used the make-many-files.c program that Marcin dug up somewhere on the net, on a 4 GB partition, which is slightly too small to hold all the files. This exercises the out-of-space (ENOSPC) handling nicely. Note: one thing that freeze_bdev does and generic_sync_sb_inodes does not is prevent new writes during the flush. We will have to think about how to handle that. Daniel
diff -r a49705ea1c95 user/kernel/commit.c --- a/user/kernel/commit.c Tue Mar 03 20:43:11 2009 -0800 +++ b/user/kernel/commit.c Wed Mar 04 03:26:33 2009 -0800 @@ -4,6 +4,7 @@ */ #include "tux3.h" +#include <linux/writeback.h> #ifndef trace #define trace trace_on @@ -32,6 +33,8 @@ sb->atomgen = from_be_u32(super->atomgen); sb->freeatom = from_be_u32(super->freeatom); sb->dictsize = from_be_u64(super->dictsize); + sb->minchange = 8; /* total blocks changed by smallest change */ + sb->margin = 100; // should be tunable? trace("blocksize %u, blockbits %u, blockmask %08x", sb->blocksize, sb->blockbits, sb->blockmask); trace("volblocks %Lu, freeblocks %Lu, nextalloc %Lu", @@ -236,6 +239,50 @@ return 0; } +int reserve_credits(struct sb *sb, unsigned credits) +{ + sb->credits += credits; + if (sb->margin > 0 && sb->freeblocks < sb->credits + sb->margin) { +#ifdef __KERNEL__ + struct block_device *bdev = vfs_sb(sb)->s_bdev; + warn(">>> %Lx free, %Lx credits", (L)sb->freeblocks, (L)sb->credits); + sb->margin = -sb->margin; + generic_sync_sb_inodes(vfs_sb(sb), &(struct writeback_control){ + .sync_mode = WB_SYNC_ALL, + .range_end = LLONG_MAX, + .nr_to_write = LONG_MAX }); + sb->credits = 0; + thaw_bdev(bdev, vfs_sb(sb)); + sb->margin = -sb->margin; +#endif + warn(">>> nospace = %i", sb->freeblocks < sb->credits + sb->margin); + if (sb->freeblocks < sb->credits + sb->margin) { + return -ENOSPC; + } + } + return 0; +} + +void release_credits(struct sb *sb, unsigned credits) +{ + // <try to return unused credits to pool> // +} + +int reserve_begin(struct sb *sb) +{ + int err; + if ((err = reserve_credits(sb, sb->minchange))) + return err; + return change_begin(sb); +} + +int reduce_begin(struct sb *sb) +{ + int err; + if ((err = reserve_credits(sb, 0))) + return err; + return change_begin(sb); +} #ifdef __KERNEL__ static void *useme[] = { clean_buffer, need_delta, stage_delta, commit_delta, useme }; #endif diff -r a49705ea1c95 user/kernel/dir.c --- a/user/kernel/dir.c Tue Mar 03 
20:43:11 2009 -0800 +++ b/user/kernel/dir.c Wed Mar 04 03:26:33 2009 -0800 @@ -132,7 +132,7 @@ while (entry <= limit) { if (entry->rec_len == 0) { brelse(buffer); - tux_error(dir->i_sb, "zero-length directory entry"); + warn("[CORRUPTION] Zero length directory record detected!"); return -EIO; } name_len = TUX_REC_LEN(entry->name_len); diff -r a49705ea1c95 user/kernel/filemap.c --- a/user/kernel/filemap.c Tue Mar 03 20:43:11 2009 -0800 +++ b/user/kernel/filemap.c Wed Mar 04 03:26:33 2009 -0800 @@ -511,7 +511,11 @@ loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + struct sb *sb = tux_sb(mapping->host->i_sb); + int err; *pagep = NULL; + if ((err = reserve_credits(sb, (len >> sb->blockbits) + 10))) + return err; return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, tux3_da_get_block); } @@ -519,8 +523,10 @@ static int tux3_writepage(struct page *page, struct writeback_control *wbc) { struct sb *sb = tux_sb(page->mapping->host->i_sb); - change_begin(sb); - int err = block_write_full_page(page, tux3_get_block, wbc); + int err = change_begin(sb); + if (err) + return err; + err = block_write_full_page(page, tux3_get_block, wbc); change_end(sb); return err; } diff -r a49705ea1c95 user/kernel/inode.c --- a/user/kernel/inode.c Tue Mar 03 20:43:11 2009 -0800 +++ b/user/kernel/inode.c Wed Mar 04 03:26:33 2009 -0800 @@ -233,9 +233,10 @@ trace("save inode 0x%Lx", (L)tux_inode(inode)->inum); struct sb *sb = tux_sb(inode->i_sb); struct btree *itable = itable_btree(sb); + struct cursor *cursor; int err; - struct cursor *cursor = alloc_cursor(itable, 1); /* +1 for new depth */ - if (!cursor) + + if (!(cursor = alloc_cursor(itable, 1))) /* +1 for new depth */ return -ENOMEM; down_write(&cursor->btree->lock); if ((err = probe(cursor, tux_inode(inode)->inum))) diff -r a49705ea1c95 user/kernel/namei.c --- a/user/kernel/namei.c Tue Mar 03 20:43:11 2009 -0800 +++ b/user/kernel/namei.c Wed Mar 04 03:26:33 2009 -0800 @@ -60,13 +60,14 @@ 
static int tux3_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { + struct sb *sb = tux_sb(dir->i_sb); struct inode *inode; int err; if (!huge_valid_dev(rdev)) return -EINVAL; - - change_begin(tux_sb(dir->i_sb)); + if ((err = reserve_begin(sb))) + return err; inode = tux_create_inode(dir, mode, rdev); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -81,7 +82,7 @@ iput(inode); } out: - change_end(tux_sb(dir->i_sb)); + change_end(sb); return err; } @@ -100,13 +101,14 @@ static int tux3_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { + struct sb *sb = tux_sb(dir->i_sb); struct inode *inode = old_dentry->d_inode; int err; if (inode->i_nlink >= TUX_LINK_MAX) return -EMLINK; - - change_begin(tux_sb(inode->i_sb)); + if ((err = reserve_begin(sb))) + return err; inode->i_ctime = gettime(); inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -115,17 +117,19 @@ inode_dec_link_count(inode); iput(inode); } - change_end(tux_sb(inode->i_sb)); + change_end(sb); return err; } static int tux3_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { + struct sb *sb = tux_sb(dir->i_sb); struct inode *inode; int err; - change_begin(tux_sb(dir->i_sb)); + if ((err = reserve_begin(sb))) + return err; inode = tux_create_inode(dir, S_IFLNK | S_IRWXUGO, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -139,41 +143,42 @@ iput(inode); } out: - change_end(tux_sb(dir->i_sb)); + change_end(sb); return err; } static int tux3_unlink(struct inode *dir, struct dentry *dentry) { + struct sb *sb = tux_sb(dir->i_sb); struct inode *inode = dentry->d_inode; - change_begin(tux_sb(inode->i_sb)); - int err = tux_del_dirent(dir, dentry); - if (!err) { + int err; + + if ((err = reduce_begin(sb))) + return err; + if (!(tux_del_dirent(dir, dentry))) { inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - change_end(tux_sb(inode->i_sb)); + change_end(sb); return err; } static int tux3_rmdir(struct inode *dir, struct dentry 
*dentry) { + struct sb *sb = tux_sb(dir->i_sb); struct inode *inode = dentry->d_inode; int err; - err = tux_dir_is_empty(inode); - if (!err) { - change_begin(tux_sb(inode->i_sb)); - err = tux_del_dirent(dir, dentry); - if (!err) { - inode->i_ctime = dir->i_ctime; - inode->i_size = 0; - clear_nlink(inode); - mark_inode_dirty(inode); - inode_dec_link_count(dir); - } - change_end(tux_sb(inode->i_sb)); + if ((err = reduce_begin(sb))) + return err; + if (!(tux_del_dirent(dir, dentry))) { + inode->i_ctime = dir->i_ctime; + inode->i_size = 0; + clear_nlink(inode); + mark_inode_dirty(inode); + inode_dec_link_count(dir); } + change_end(sb); return err; } @@ -184,6 +189,7 @@ struct inode *new_inode = new_dentry->d_inode; struct buffer_head *old_buffer, *new_buffer; tux_dirent *old_entry, *new_entry; + struct sb *sb = tux_sb(old_dir->i_sb); int err, new_subdir = 0; old_entry = tux_find_entry(old_dir, old_dentry->d_name.name, @@ -194,7 +200,8 @@ /* FIXME: is this needed? */ BUG_ON(from_be_u64(old_entry->inum) != tux_inode(old_inode)->inum); - change_begin(tux_sb(old_inode->i_sb)); + if ((err = reserve_begin(sb))) + return err; if (new_inode) { int old_is_dir = S_ISDIR(old_inode->i_mode); if (old_is_dir) { @@ -244,11 +251,11 @@ if (!err && new_subdir) inode_dec_link_count(old_dir); - change_end(tux_sb(old_inode->i_sb)); + change_end(sb); return err; error: - change_end(tux_sb(old_inode->i_sb)); + change_end(sb); brelse(old_buffer); return err; } diff -r a49705ea1c95 user/kernel/tux3.h --- a/user/kernel/tux3.h Tue Mar 03 20:43:11 2009 -0800 +++ b/user/kernel/tux3.h Wed Mar 04 03:26:33 2009 -0800 @@ -337,6 +337,9 @@ struct list_head pinned; /* dirty metadata not flushed per delta */ struct list_head commit; /* dirty metadata flushed per delta */ struct list_head dirty_inodes; /* dirty inodes list */ + unsigned credits; /* Blocks reserved for inflight operations */ + unsigned minchange; /* total blocks changed by smallest change */ + int margin; /* ENOSPC when freeblocks less than 
this */ #ifdef __KERNEL__ struct super_block *vfs_sb; /* Generic kernel superblock */ #else @@ -900,6 +903,9 @@ int load_itable(struct sb *sb); int change_begin(struct sb *sb); int change_end(struct sb *sb); +int reserve_credits(struct sb *sb, unsigned credits); +int reserve_begin(struct sb *sb); +int reduce_begin(struct sb *sb); /* temporary hack for buffer */ struct buffer_head *blockread(struct address_space *mapping, block_t iblock);
_______________________________________________ Tux3 mailing list Tux3@tux3.org http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3