The attached patch demonstrates deferred inode creation, based on
yesterday's split-up of ext2_new_inode into a front end part and a back
end part named ext2_assign_ino. Inode assignment is driven from the
directory flush routine, because inode allocation wants to know the
containing directory in order to guide its choice of inode number.
Actually, if a new inode is linked more than once before a flush, we
don't really know which directory it will be allocated into, a little
different from the current Ext2 behavior, which allocates it near the
directory it is first linked into. I doubt this makes much difference.
There is one puzzling glitch having to do with the inode dirty list.
I had to mask off the inode dirty state in ext2_assign_inode before
marking the inode dirty, otherwise the inode was not actually being
placed on the sb->s_dirty list. Meaning that somebody had set the inode
dirty flag(s) (plural, because there I_DIRTY is actually 3 dirty bits
orred together) without placing the inode on the dirty list. But I ran
out of time trying to find out who that was. The most likely suspect is
me of course, but nothing obvious jumped out.
My locking here is extremely suspect:
35 +int ext2_flush_dir(struct dentry *dir)
...
48 + if (!dentry->d_inode->i_ino) {
49 + show_inode("assign ino",
dentry->d_inode);
50 + spin_unlock(&dentry->d_lock); // this
is probably wrong
51 + spin_unlock(&dcache_lock);
52 + ext2_assign_ino(dir->d_inode,
dentry->d_inode);
53 + spin_lock(&dcache_lock);
54 + spin_lock(&dentry->d_lock);
55 + }
What are these locks protecting again? Why is it ok to drop them and
retake them here? (My theory is that the directory i_mutex is doing
the protecting, but if so, what is dcache_lock doing for us?)
Here is a test run:
mount /dev/ubdb /mnt && ls /mnt && touch /mnt/foo && fsync /mnt && umount /mnt
>>> ext2_sync_dir 098513c0 "/"
>>> ext2_sync_dir 098513c0 "/"
lost+found
>>> defer inode create: 0988bc90/1 0
>>> ext2_create: 09851a0c/1 0 00000000 "foo3"
--- state = 0
--- state = 7
>>> defer create: 09851a0c/2 0 0988bc90 "foo3"
>>> ext2_sync_dir 098513c0 "/"
>>> dentry: 09851a0c/1 0 0988bc90 "foo3"
>>> assign ino: 0988bc90/1 0
>>> deferred link: 09851a0c/1 0 0988bc90 "foo3"
We ended up with a correct Ext2 filesystem after umount, yay. I guess
this works, there is just rename to take care of now before seeing what
kind of latency improvement we see out of this. But on the other hand,
I think my priorities just changed re what to work on next. Christmas
is getting close and there is a bunch of work to do on atomic commit.
Regards,
Daniel
diff --git a/fs/dcache.c b/fs/dcache.c
index 6068c25..e00e86b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1387,14 +1387,16 @@ out:
void d_delete(struct dentry * dentry)
{
- int isdir = 0;
+ int isdir = 0, hidden;
/*
* Are we the only user?
*/
spin_lock(&dcache_lock);
spin_lock(&dentry->d_lock);
isdir = S_ISDIR(dentry->d_inode->i_mode);
- if (atomic_read(&dentry->d_count) == 1) {
+ hidden = dentry->d_op && dentry->d_op->d_hide && dentry->d_op->d_hide(dentry);
+
+ if (atomic_read(&dentry->d_count) == 1 + hidden) {
dentry_iput(dentry);
fsnotify_nameremove(dentry, isdir);
return;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index a78c6b4..058ecf6 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -270,6 +270,65 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
de->file_type = 0;
}
+int ext2_assign_ino(struct inode *dir, struct inode *inode);
+int real_unlink(struct inode *dir, struct dentry *dentry);
+
+int ext2_flush_dir(struct dentry *dir)
+{
+ struct list_head *next;
+ printk(">>> ext2_sync_dir %p \"%.*s\"\n", dir, dir->d_name.len, dir->d_name.name);
+ spin_lock(&dcache_lock);
+ for (next = dir->d_subdirs.next; next != &dir->d_subdirs;) {
+ struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
+ next = next->next;
+ show_dentry("dentry", dentry);
+ if (d_unhashed(dentry))
+ continue;
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_inode) {
+ if (!dentry->d_inode->i_ino) {
+ show_inode("assign ino", dentry->d_inode);
+ spin_unlock(&dentry->d_lock); // this is probably wrong
+ spin_unlock(&dcache_lock);
+ ext2_assign_ino(dir->d_inode, dentry->d_inode);
+ spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
+ }
+ if (!(dentry->d_flags & DCACHE_BACKED)) {
+ int stale = dentry->d_flags & DCACHE_STALE;
+ show_dentry("deferred link", dentry);
+ dentry->d_flags &= ~DCACHE_STALE;
+ dentry->d_flags |= DCACHE_BACKED;
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ if (stale)
+ real_unlink(dir->d_inode, dentry);
+ if (ext2_add_link(dentry, dentry->d_inode)) { // report err!!!
+ inode_dec_link_count(dentry->d_inode);
+ iput(dentry->d_inode);
+ }
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ continue;
+ }
+ } else {
+ if ((dentry->d_flags & DCACHE_STALE)) {
+ dentry->d_flags &= ~DCACHE_STALE;
+ show_dentry("deferred unlink", dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ real_unlink(dir->d_inode, dentry);
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ continue;
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+ spin_unlock(&dcache_lock);
+ return 0; // really??
+}
+
static int
ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
{
@@ -283,6 +342,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
unsigned char *types = NULL;
int need_revalidate = filp->f_version != inode->i_version;
+ ext2_flush_dir(filp->f_path.dentry);
+
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
@@ -699,6 +760,12 @@ not_empty:
return 0;
}
+int ext2_sync_dir(struct file *file, struct dentry *dir, int datasync)
+{
+ ext2_flush_dir(dir);
+ return ext2_sync_file(file, dir, datasync);
+}
+
const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
@@ -707,5 +774,5 @@ const struct file_operations ext2_dir_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .fsync = ext2_sync_file,
+ .fsync = ext2_sync_dir,
};
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da..beaf67d 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -117,7 +117,6 @@ extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page
extern int ext2_sync_file (struct file *, struct dentry *, int);
/* ialloc.c */
-extern struct inode * ext2_new_inode (struct inode *, int);
extern void ext2_free_inode (struct inode *);
extern unsigned long ext2_count_free_inodes (struct super_block *);
extern void ext2_check_inodes_bitmap (struct super_block *);
@@ -183,3 +182,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
}
+
+static inline void show_inode(char *tag, struct inode *inode)
+{
+ printk(">>> %s: %p/%i %x\n", tag, inode,
+ atomic_read(&inode->i_count), inode->i_flags);
+}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f597413..df6aa95 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -435,25 +435,19 @@ found:
return group;
}
-struct inode *ext2_new_inode(struct inode *dir, int mode)
+int ext2_assign_ino(struct inode *dir, struct inode *inode)
{
- struct super_block *sb;
+ struct super_block *sb = inode->i_sb;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
- int group, i;
+ int group, i, mode = inode->i_mode;
ino_t ino = 0;
- struct inode * inode;
struct ext2_group_desc *gdp;
struct ext2_super_block *es;
struct ext2_inode_info *ei;
struct ext2_sb_info *sbi;
int err;
- sb = dir->i_sb;
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
ei = EXT2_I(inode);
sbi = EXT2_SB(sb);
es = sbi->s_es;
@@ -550,41 +544,8 @@ got:
sb->s_dirt = 1;
mark_buffer_dirty(bh2);
- inode->i_uid = current->fsuid;
- if (test_opt (sb, GRPID))
- inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current->fsgid;
- inode->i_mode = mode;
-
- inode->i_ino = ino;
- inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
- memset(ei->i_data, 0, sizeof(ei->i_data));
- ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
- /* dirsync is only applied to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT2_DIRSYNC_FL;
- ei->i_faddr = 0;
- ei->i_frag_no = 0;
- ei->i_frag_size = 0;
- ei->i_file_acl = 0;
- ei->i_dir_acl = 0;
- ei->i_dtime = 0;
- ei->i_block_alloc_info = NULL;
ei->i_block_group = group;
- ei->i_dir_start_lookup = 0;
- ei->i_state = EXT2_STATE_NEW;
- ext2_set_inode_flags(inode);
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
+ inode->i_ino = ino;
insert_inode_hash(inode);
if (DQUOT_ALLOC_INODE(inode)) {
@@ -600,10 +561,11 @@ got:
if (err)
goto fail_free_drop;
+ inode->i_state &= ~I_DIRTY;
mark_inode_dirty(inode);
ext2_debug("allocating inode %lu\n", inode->i_ino);
ext2_preread_inode(inode);
- return inode;
+ return 0;
fail_free_drop:
DQUOT_FREE_INODE(inode);
@@ -612,13 +574,11 @@ fail_drop:
DQUOT_DROP(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
- iput(inode);
- return ERR_PTR(err);
+ return err;
fail:
make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(err);
+ return err;
}
unsigned long ext2_count_free_inodes (struct super_block * sb)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d..a9ce89b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -58,6 +58,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
*/
void ext2_delete_inode (struct inode * inode)
{
+ printk(">>> ext2_delete_inode\n");
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 80c97fd..35dfae7 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -36,16 +36,106 @@
#include "acl.h"
#include "xip.h"
-static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
+static int ext2_hide_dentry(struct dentry *dentry)
{
- int err = ext2_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
+ struct dentry *clone;
+ if (!(dentry->d_flags & DCACHE_BACKED)) {
+ /* converting unbacked to negative */
+ dput(dentry); /* Cancel dget from deferred create */
return 0;
}
- inode_dec_link_count(inode);
- iput(inode);
- return err;
+ if (atomic_read(&dentry->d_count) == 1) {
+ BUG_ON(!dentry->d_inode);
+ show_dentry("hide dentry", dentry);
+ dentry->d_flags &= ~DCACHE_BACKED;
+ dentry->d_flags |= DCACHE_STALE;
+ dget(dentry);
+ return 1;
+ }
+ show_dentry("busy dentry", dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ clone = d_alloc(dentry->d_parent, &dentry->d_name);
+ show_dentry("clone dentry", clone);
+ clone->d_flags |= DCACHE_STALE;
+ d_instantiate(clone, NULL);
+ d_rehash(clone);
+ spin_lock(&dentry->d_lock);
+ spin_lock(&dcache_lock);
+ return atomic_read(&dentry->d_count) == 1;
+}
+
+static struct dentry_operations ext2_dentry_operations = {
+ .d_hide = ext2_hide_dentry,
+};
+
+int ext2_assign_ino(struct inode *dir, struct inode *inode);
+
+static struct inode *ext2_new_inode(struct inode *dir, int mode) {
+ struct super_block *sb = dir->i_sb;
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+ struct inode *inode = new_inode(sb);
+ struct ext2_inode_info *ei;
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ ei = EXT2_I(inode);
+ inode->i_mode = mode;
+ inode->i_uid = current->fsuid;
+ inode->i_ino = 0;
+ if (test_opt(sb, GRPID))
+ inode->i_gid = dir->i_gid;
+ else if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current->fsgid;
+
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
+ /* dirsync is only applied to directories */
+ if (!S_ISDIR(mode))
+ ei->i_flags &= ~EXT2_DIRSYNC_FL;
+ ei->i_faddr = 0;
+ ei->i_frag_no = 0;
+ ei->i_frag_size = 0;
+ ei->i_file_acl = 0;
+ ei->i_dir_acl = 0;
+ ei->i_dtime = 0;
+ ei->i_block_alloc_info = NULL;
+ ei->i_dir_start_lookup = 0;
+ ei->i_state = EXT2_STATE_NEW;
+ ext2_set_inode_flags(inode);
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+ show_inode("defer inode create", inode);
+ return inode;
+}
+
+static int ext2_unlink(struct inode *dir, struct dentry *dentry)
+{
+ show_dentry("defer unlink", dentry);
+ dentry->d_inode->i_ctime = dir->i_ctime;
+ inode_dec_link_count(dentry->d_inode);
+ return 0;
+}
+
+static int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+ if (!(dentry->d_flags & DCACHE_STALE)) /* already deferred? */
+ dget(dentry);
+ dentry->d_op = &ext2_dentry_operations;
+ d_instantiate(dentry, inode);
+ show_dentry("defer create", dentry);
+ return 0;
}
/*
@@ -66,7 +156,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
inode = ext2_iget(dir->i_sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
+ dentry->d_flags |= DCACHE_BACKED;
+ show_dentry("found real dirent", dentry);
}
+ dentry->d_op = &ext2_dentry_operations;
return d_splice_alias(inode, dentry);
}
@@ -119,7 +212,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
inode->i_mapping->a_ops = &ext2_aops;
inode->i_fop = &ext2_file_operations;
}
+ show_dentry("ext2_create", dentry);
+ printk("--- state = %lx\n", inode->i_state);
mark_inode_dirty(inode);
+ printk("--- state = %lx\n", inode->i_state);
err = ext2_add_nondir(dentry, inode);
}
return err;
@@ -237,6 +333,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (err)
goto out_fail;
+ dentry->d_op = &ext2_dentry_operations;
d_instantiate(dentry, inode);
out:
return err;
@@ -250,26 +347,12 @@ out_dir:
goto out;
}
-static int ext2_unlink(struct inode * dir, struct dentry *dentry)
+int real_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
- struct ext2_dir_entry_2 * de;
- struct page * page;
- int err = -ENOENT;
-
- de = ext2_find_entry (dir, dentry, &page);
- if (!de)
- goto out;
-
- err = ext2_delete_entry (de, page);
- if (err)
- goto out;
-
- inode->i_ctime = dir->i_ctime;
- inode_dec_link_count(inode);
- err = 0;
-out:
- return err;
+ struct page *page;
+ struct ext2_dir_entry_2 *de = ext2_find_entry(dir, dentry, &page);
+ show_dentry("ext2_unlink", dentry);
+ return de ? ext2_delete_entry(de, page) : -ENOENT;
}
static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ef50cbc..5273930 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -295,17 +295,46 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
#endif
+extern spinlock_t inode_lock;
+
+static void defer_drop_inode(struct inode *inode)
+{
+ if (inode->i_nlink) {
+ generic_drop_inode(inode);
+ return;
+ }
+ show_inode("defer inode delete", inode);
+ inode->i_state |= I_DIRTY;
+ list_move(&inode->i_list, &EXT2_SB(inode->i_sb)->deletes);
+ spin_unlock(&inode_lock);
+}
+
+extern struct list_head inode_unused;
+
+static int ext2_sync_fs(struct super_block *sb, int wait)
+{
+ while (!list_empty(&EXT2_SB(sb)->deletes)) {
+ struct inode *inode = list_entry(EXT2_SB(sb)->deletes.next, struct inode, i_list);
+ show_inode("delete deferred inode", inode);
+ spin_lock(&inode_lock);
+ generic_delete_inode(inode); /* removes from list and drops lock */
+ }
+ return 0;
+}
+
static const struct super_operations ext2_sops = {
.alloc_inode = ext2_alloc_inode,
.destroy_inode = ext2_destroy_inode,
.write_inode = ext2_write_inode,
.delete_inode = ext2_delete_inode,
+ .drop_inode = defer_drop_inode,
.put_super = ext2_put_super,
.write_super = ext2_write_super,
.statfs = ext2_statfs,
.remount_fs = ext2_remount,
.clear_inode = ext2_clear_inode,
.show_options = ext2_show_options,
+ .sync_fs = ext2_sync_fs,
#ifdef CONFIG_QUOTA
.quota_read = ext2_quota_read,
.quota_write = ext2_quota_write,
@@ -614,6 +643,7 @@ static int ext2_setup_super (struct super_block * sb,
EXT2_BLOCKS_PER_GROUP(sb),
EXT2_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
+ INIT_LIST_HEAD(&sbi->deletes);
return res;
}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d982eb8..11b4ecb 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -133,6 +133,7 @@ struct dentry_operations {
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
+ int (*d_hide)(struct dentry *);
};
/* the dentry parameter passed to d_hash and d_compare is the parent
@@ -176,6 +177,9 @@ d_iput: no no no yes
#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */
+#define DCACHE_STALE 0x0040 /* FS has wrong dirent */
+#define DCACHE_BACKED 0x0080 /* FS has dirent */
+
extern spinlock_t dcache_lock;
extern seqlock_t rename_lock;
@@ -363,4 +367,11 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
extern int sysctl_vfs_cache_pressure;
+static inline void show_dentry(char *tag, struct dentry *dentry)
+{
+ printk(">>> %s: %p/%i %x %p \"%.*s\"\n", tag, dentry,
+ atomic_read(&dentry->d_count), dentry->d_flags, dentry->d_inode,
+ dentry->d_name.len, dentry->d_name.name);
+}
+
#endif /* __LINUX_DCACHE_H */
diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h
index f273415..7198aab 100644
--- a/include/linux/ext2_fs_sb.h
+++ b/include/linux/ext2_fs_sb.h
@@ -106,6 +106,7 @@ struct ext2_sb_info {
spinlock_t s_rsv_window_lock;
struct rb_root s_rsv_window_root;
struct ext2_reserve_window_node s_rsv_window_head;
+ struct list_head deletes;
};
#endif /* _LINUX_EXT2_FS_SB */
_______________________________________________
Tux3 mailing list
[email protected]
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3