On Friday, November 16, 2001 05:07:59 PM -0500 "Philip R. Auld"
<[EMAIL PROTECTED]> wrote:
> We see much better performance without O_SYNC jobs. With it
> the journaling really hurts. So that seems to be the issue
> here.
>
Ok, you really want the attached patch then. Porting it back to 2.4.7 should
not be hard at all, but I can do it over the weekend if needed. This
eventually ended up in 2.4.10, and is stable.
-chris
Index: 0.33/fs/reiserfs/stree.c
--- 0.33/fs/reiserfs/stree.c Wed, 10 Oct 2001 10:26:52 -0400 root (linux/34_stree.c
1.1.2.1.4.1.1.1 644)
+++ 0.34/fs/reiserfs/stree.c Wed, 10 Oct 2001 10:36:16 -0400 root (linux/34_stree.c
+1.1.2.1.4.1.1.2 644)
@@ -1827,6 +1827,7 @@
journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
journal_begin(th, p_s_inode->i_sb, orig_len_alloc) ;
+ reiserfs_update_inode_transaction(p_s_inode) ;
}
} while ( n_file_size > ROUND_UP (n_new_file_size) &&
search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path)
== POSITION_FOUND ) ;
Index: 0.33/fs/reiserfs/file.c
--- 0.33/fs/reiserfs/file.c Sun, 23 Sep 2001 20:11:16 -0400 root (linux/36_file.c
1.1.2.1 644)
+++ 0.34/fs/reiserfs/file.c Wed, 10 Oct 2001 10:36:16 -0400 root (linux/36_file.c
+1.1.2.1.5.1 644)
@@ -42,6 +42,7 @@
lock_kernel() ;
down (&inode->i_sem);
journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
+ reiserfs_update_inode_transaction(inode) ;
#ifdef REISERFS_PREALLOCATE
reiserfs_discard_prealloc (&th, inode);
@@ -75,10 +76,7 @@
int datasync
) {
struct inode * p_s_inode = p_s_dentry->d_inode;
- struct reiserfs_transaction_handle th ;
int n_err;
- int windex ;
- int jbegin_count = 1 ;
lock_kernel() ;
@@ -87,14 +85,7 @@
n_err = fsync_inode_buffers(p_s_inode) ;
n_err |= fsync_inode_data_buffers(p_s_inode);
- /* commit the current transaction to flush any metadata
- ** changes. sys_fsync takes care of flushing the dirty pages for us
- */
- journal_begin(&th, p_s_inode->i_sb, jbegin_count) ;
- windex = push_journal_writer("sync_file") ;
- reiserfs_update_sd(&th, p_s_inode);
- pop_journal_writer(windex) ;
- journal_end_sync(&th, p_s_inode->i_sb,jbegin_count) ;
+ reiserfs_commit_for_inode(p_s_inode) ;
unlock_kernel() ;
return ( n_err < 0 ) ? -EIO : 0;
}
Index: 0.33/fs/reiserfs/dir.c
--- 0.33/fs/reiserfs/dir.c Wed, 10 Oct 2001 10:26:52 -0400 root (linux/44_dir.c
1.1.2.1.3.1.1.1 644)
+++ 0.34/fs/reiserfs/dir.c Wed, 10 Oct 2001 10:36:16 -0400 root (linux/44_dir.c
+1.1.2.1.3.1.1.2 644)
@@ -39,22 +39,10 @@
};
int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) {
- int ret = 0 ;
- int windex ;
- struct reiserfs_transaction_handle th ;
-
lock_kernel();
-
- journal_begin(&th, dentry->d_inode->i_sb, 1) ;
- windex = push_journal_writer("dir_fsync") ;
- reiserfs_prepare_for_journal(th.t_super, SB_BUFFER_WITH_SB(th.t_super), 1) ;
- journal_mark_dirty(&th, dentry->d_inode->i_sb, SB_BUFFER_WITH_SB
(dentry->d_inode->i_sb)) ;
- pop_journal_writer(windex) ;
- journal_end_sync(&th, dentry->d_inode->i_sb, 1) ;
-
- unlock_kernel();
-
- return ret ;
+ reiserfs_commit_for_inode(dentry->d_inode) ;
+ unlock_kernel() ;
+ return 0 ;
}
Index: 0.33/fs/reiserfs/inode.c
--- 0.33/fs/reiserfs/inode.c Wed, 10 Oct 2001 10:26:52 -0400 root (linux/45_inode.c
1.1.2.1.8.1.1.1 644)
+++ 0.34/fs/reiserfs/inode.c Wed, 10 Oct 2001 10:36:16 -0400 root (linux/45_inode.c
+1.1.2.1.8.1.1.2 644)
@@ -35,6 +35,7 @@
down (&inode->i_sem);
journal_begin(&th, inode->i_sb, jbegin_count) ;
+ reiserfs_update_inode_transaction(inode) ;
windex = push_journal_writer("delete_inode") ;
reiserfs_delete_object (&th, inode);
@@ -230,6 +231,7 @@
reiserfs_update_sd(th, inode) ;
journal_end(th, s, len) ;
journal_begin(th, s, len) ;
+ reiserfs_update_inode_transaction(inode) ;
}
// it is called by get_block when create == 0. Returns block number
@@ -569,6 +571,7 @@
TYPE_ANY, 3/*key length*/);
if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
journal_begin(&th, inode->i_sb, jbegin_count) ;
+ reiserfs_update_inode_transaction(inode) ;
transaction_started = 1 ;
}
research:
@@ -593,6 +596,7 @@
if (!transaction_started) {
pathrelse(&path) ;
journal_begin(&th, inode->i_sb, jbegin_count) ;
+ reiserfs_update_inode_transaction(inode) ;
transaction_started = 1 ;
goto research ;
}
@@ -661,6 +665,7 @@
*/
pathrelse(&path) ;
journal_begin(&th, inode->i_sb, jbegin_count) ;
+ reiserfs_update_inode_transaction(inode) ;
transaction_started = 1 ;
goto research;
}
@@ -1305,6 +1310,10 @@
return ;
}
lock_kernel() ;
+
+ /* this is really only used for atime updates, so they don't have
+ ** to be included in O_SYNC or fsync
+ */
journal_begin(&th, inode->i_sb, 1) ;
reiserfs_update_sd (&th, inode);
journal_end(&th, inode->i_sb, 1) ;
@@ -1680,6 +1689,7 @@
** (it will unmap bh if it packs).
*/
journal_begin(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 ) ;
+ reiserfs_update_inode_transaction(p_s_inode) ;
windex = push_journal_writer("reiserfs_vfs_truncate_file") ;
reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
pop_journal_writer(windex) ;
@@ -1726,6 +1736,7 @@
start_over:
lock_kernel() ;
journal_begin(&th, inode->i_sb, jbegin_count) ;
+ reiserfs_update_inode_transaction(inode) ;
make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
@@ -1953,27 +1964,38 @@
return generic_block_bmap(as, block, reiserfs_bmap) ;
}
-
static int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to) {
struct inode *inode = page->mapping->host ;
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
int ret ;
reiserfs_wait_on_write_block(inode->i_sb) ;
+
+ /* generic_commit_write does this for us, but does not update the
+ ** transaction tracking stuff when the size changes. So, we have
+ ** to do the i_size updates here.
+ */
+ if (pos > inode->i_size) {
+ struct reiserfs_transaction_handle th ;
+ lock_kernel() ;
+ journal_begin(&th, inode->i_sb, 1) ;
+ reiserfs_update_inode_transaction(inode) ;
+ inode->i_size = pos ;
+ reiserfs_update_sd(&th, inode) ;
+ journal_end(&th, inode->i_sb, 1) ;
+ unlock_kernel() ;
+ }
+
ret = generic_commit_write(f, page, from, to) ;
/* we test for O_SYNC here so we can commit the transaction
** for any packed tails the file might have had
*/
if (f->f_flags & O_SYNC) {
- struct reiserfs_transaction_handle th ;
lock_kernel() ;
- journal_begin(&th, inode->i_sb, 1) ;
- reiserfs_prepare_for_journal(inode->i_sb,
- SB_BUFFER_WITH_SB(inode->i_sb), 1) ;
- journal_mark_dirty(&th, inode->i_sb, SB_BUFFER_WITH_SB(inode->i_sb)) ;
- journal_end_sync(&th, inode->i_sb, 1) ;
- unlock_kernel();
+ reiserfs_commit_for_inode(inode) ;
+ unlock_kernel();
}
return ret ;
}
Index: 0.33/fs/reiserfs/namei.c
--- 0.33/fs/reiserfs/namei.c Wed, 10 Oct 2001 10:26:52 -0400 root (linux/50_namei.c
1.1.2.1.8.1.1.1 644)
+++ 0.34/fs/reiserfs/namei.c Wed, 10 Oct 2001 10:36:16 -0400 root (linux/50_namei.c
+1.1.2.1.8.1.1.2 644)
@@ -554,6 +554,8 @@
iput (inode);
return retval;
}
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
d_instantiate(dentry, inode);
pop_journal_writer(windex) ;
@@ -596,6 +598,9 @@
//FIXME: needed for block and char devices only
reiserfs_update_sd (&th, inode);
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
+
retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
inode, 1/*visible*/);
if (retval) {
@@ -651,6 +656,8 @@
journal_end(&th, dir->i_sb, jbegin_count) ;
return retval;
}
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
inode->i_op = &reiserfs_dir_inode_operations;
inode->i_fop = &reiserfs_dir_operations;
@@ -719,6 +726,9 @@
}
inode = dentry->d_inode;
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
+
if (de.de_objectid != inode->i_ino) {
// FIXME: compare key of an object and a key found in the
// entry
@@ -792,6 +802,9 @@
}
inode = dentry->d_inode;
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
+
if (de.de_objectid != inode->i_ino) {
// FIXME: compare key of an object and a key found in the
// entry
@@ -881,6 +894,9 @@
return retval;
}
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
+
inode->i_op = &page_symlink_inode_operations;
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
@@ -936,6 +952,10 @@
/* create new entry */
retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
inode, 1/*visible*/);
+
+ reiserfs_update_inode_transaction(inode) ;
+ reiserfs_update_inode_transaction(dir) ;
+
if (retval) {
pop_journal_writer(windex) ;
journal_end(&th, dir->i_sb, jbegin_count) ;
@@ -1082,6 +1102,16 @@
return retval;
}
+ reiserfs_update_inode_transaction(old_dir) ;
+ reiserfs_update_inode_transaction(new_dir) ;
+
+ /* this makes it so an fsync on an open fd for the old name will
+ ** commit the rename operation
+ */
+ reiserfs_update_inode_transaction(old_inode) ;
+
+ if (new_inode)
+ reiserfs_update_inode_transaction(new_inode) ;
while (1) {
// look for old name using corresponding entry key (found by
reiserfs_find_entry)
Index: 0.33/fs/reiserfs/journal.c
--- 0.33/fs/reiserfs/journal.c Wed, 10 Oct 2001 10:26:52 -0400 root
(linux/b/0_journal.c 1.1.2.1.3.1.1.1 644)
+++ 0.34/fs/reiserfs/journal.c Wed, 10 Oct 2001 10:36:16 -0400 root
+(linux/b/0_journal.c 1.1.2.1.3.1.1.2 644)
@@ -2319,6 +2319,11 @@
** will wait until the current transaction is done/commited before returning
*/
int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
*p_s_sb, unsigned long nblocks) {
+
+ if (SB_JOURNAL(p_s_sb)->j_len == 0) {
+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+ journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+ }
return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
}
@@ -2601,6 +2606,41 @@
}
}
return 0 ;
+}
+
+void reiserfs_update_inode_transaction(struct inode *inode) { /* tag the inode with the journal's current list index and transaction id */
+
+ inode->u.reiserfs_i.i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb); /* journal list slot; reiserfs_commit_for_inode uses it to locate the list */
+
+ inode->u.reiserfs_i.i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; /* id compared later to tell whether that slot still holds this transaction */
+}
+
+static int reiserfs_inode_in_this_transaction(struct inode *inode) { /* returns 1 when the inode's recorded id is the journal's current j_trans_id or is 0, otherwise 0 */
+ if (inode->u.reiserfs_i.i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id ||
+ inode->u.reiserfs_i.i_trans_id == 0) { /* the caller's comment treats id 0 as "unknown transaction"; it is handled like the current one */
+ return 1;
+ }
+ return 0 ;
+}
+
+void reiserfs_commit_for_inode(struct inode *inode) { /* commit the transaction this inode was last tagged with; called from the fsync and O_SYNC paths in this patch */
+ struct reiserfs_journal_list *jl ;
+ struct reiserfs_transaction_handle th ;
+ struct super_block *sb = inode->i_sb ;
+
+ jl = SB_JOURNAL_LIST(sb) + inode->u.reiserfs_i.i_trans_index ; /* list slot recorded by reiserfs_update_inode_transaction */
+
+ /* is it from the current transaction, or from an unknown transaction? */
+ if (reiserfs_inode_in_this_transaction(inode)) {
+ journal_join(&th, sb, 1) ; /* NOTE(review): presumably attaches to the running transaction instead of starting a new one - confirm */
+ reiserfs_update_inode_transaction(inode) ;
+ journal_end_sync(&th, sb, 1) ; /* synchronous end: issued via do_journal_end with COMMIT_NOW | WAIT */
+ } else if (jl->j_trans_id == inode->u.reiserfs_i.i_trans_id) { /* slot still holds the transaction the inode was tagged with */
+ flush_commit_list(sb, jl, 1) ; /* NOTE(review): meaning of the final argument 1 is not visible here - confirm against flush_commit_list */
+ }
+ /* if the transaction id does not match, this list is long since flushed
+ ** and we don't have to do anything here
+ */
}
void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
Index: 0.33/include/linux/reiserfs_fs.h
--- 0.33/include/linux/reiserfs_fs.h Wed, 10 Oct 2001 10:26:52 -0400 root
(linux/l/d/13_reiserfs_f 1.1.2.1.3.1.1.1 644)
+++ 0.34/include/linux/reiserfs_fs.h Wed, 10 Oct 2001 10:36:16 -0400 root
+(linux/l/d/13_reiserfs_f 1.1.2.1.3.1.1.2 644)
@@ -1738,6 +1738,8 @@
*/
#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) %
JOURNAL_BLOCK_COUNT])
+void reiserfs_commit_for_inode(struct inode *) ;
+void reiserfs_update_inode_transaction(struct inode *) ;
void reiserfs_wait_on_write_block(struct super_block *s) ;
void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
void reiserfs_allow_writes(struct super_block *s) ;
Index: 0.33/include/linux/reiserfs_fs_i.h
--- 0.33/include/linux/reiserfs_fs_i.h Sun, 23 Sep 2001 16:21:54 -0400 root
(linux/o/d/28_reiserfs_f 1.1 644)
+++ 0.34/include/linux/reiserfs_fs_i.h Wed, 10 Oct 2001 10:36:16 -0400 root
+(linux/o/d/28_reiserfs_f 1.1.6.1 644)
@@ -40,6 +40,12 @@
is a comment you should make.... -Hans */
//nopack-attribute
int nopack;
+
+ /* we use these for fsync or O_SYNC to decide which transaction needs
+ ** to be committed in order for this inode to be properly flushed
+ */
+ unsigned long i_trans_id ;
+ unsigned long i_trans_index ;
};