[PATCH] Add i_version_hi for 64-bit version

2007-04-03 Thread Kalpak Shah
Hi,

I have changed the i_version_hi field to a le32 and also cleaned up the 
whitespace. 

This patch adds a 32-bit i_version_hi field to ext4_inode, which can be used 
for 64-bit inode versions.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]

Index: linux-2.6.19/include/linux/ext4_fs.h
===
--- linux-2.6.19.orig/include/linux/ext4_fs.h
+++ linux-2.6.19/include/linux/ext4_fs.h
@@ -336,6 +336,7 @@ struct ext4_inode {
__le32  i_atime_extra;  /* extra Access time  (nsec  2 | epoch) */
__le32  i_crtime;   /* File Creation time */
__le32  i_crtime_extra; /* extra File Creation time (nsec  2 | epoch) 
*/
+   __le32  i_version_hi;   /* high 32 bits for 64-bit version */
 };
 
 #define i_size_highi_dir_acl

Thanks,
Kalpak Shah.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] Add extent related functions

2007-04-03 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V [EMAIL PROTECTED]

The code is derived from the latest ext4 kernel source. I
have tried to keep the code as close as possible to the kernel
sources. This makes sure that any fixes for the tree building
code in the kernel can be easily applied to ext4migrate.  The
ext3_ext naming convention, instead of the ext4_ext found in the kernel, is to
make sure we are in sync with the rest of the e2fsprogs source.

Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
---
 ext4migrate/extents.c |  737 +
 ext4migrate/extents.h |   10 +
 2 files changed, 747 insertions(+), 0 deletions(-)
 create mode 100644 ext4migrate/extents.c
 create mode 100644 ext4migrate/extents.h

diff --git a/ext4migrate/extents.c b/ext4migrate/extents.c
new file mode 100644
index 000..727755f
--- /dev/null
+++ b/ext4migrate/extents.c
@@ -0,0 +1,737 @@
+#include stdio.h
+#include stdlib.h
+
+#include migrate.h
+#include extents.h
+
+struct ext4_ext_path {
+   ext4_fsblk_tp_block;
+   __u16   p_depth;
+   struct ext3_extent  *p_ext;
+   struct ext3_extent_idx  *p_idx;
+   struct ext3_extent_header   *p_hdr;
+};
+
+/*
+ * ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+ext4_fsblk_t ext_pblock(struct ext3_extent *ex)
+{
+   ext4_fsblk_t block;
+
+   block = ex-ee_start;
+   block |= ((ext4_fsblk_t) ex-ee_start_hi  31)  1;
+   return block;
+}
+
+/*
+ * idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+ext4_fsblk_t idx_pblock(struct ext3_extent_idx *ix)
+{
+   ext4_fsblk_t block;
+
+   block = ix-ei_leaf;
+   block |= ((ext4_fsblk_t) ix-ei_leaf_hi  31)  1;
+   return block;
+}
+
+/*
+ * ext3_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+void ext3_ext_store_pblock(struct ext3_extent *ex, ext4_fsblk_t pb)
+{
+   ex-ee_start = (unsigned long) (pb  0x);
+   ex-ee_start_hi = (unsigned long) ((pb  31)  1)  0x;
+}
+
+/*
+ * ext3_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static void ext3_idx_store_pblock(struct ext3_extent_idx *ix, ext4_fsblk_t pb)
+{
+   ix-ei_leaf = (unsigned long) (pb  0x);
+   ix-ei_leaf_hi = (unsigned long) ((pb  31)  1)  0x;
+}
+
+
+static int __ext3_ext_space_block(ext2_filsys filesys)
+{
+   int size;
+
+   size = (filesys-blocksize - sizeof(struct ext3_extent_header))
+   / sizeof(struct ext3_extent);
+   return size;
+}
+static int ext3_ext_space_block()
+{
+   return  __ext3_ext_space_block(current_fs);
+}
+
+static int __ext3_ext_space_block_idx(ext2_filsys filesys)
+{
+   int size;
+
+   size = (filesys-blocksize - sizeof(struct ext3_extent_header))
+   / sizeof(struct ext3_extent_idx);
+   return size;
+}
+static int ext3_ext_space_block_idx()
+{
+   return __ext3_ext_space_block_idx(current_fs);
+
+}
+
+int ext3_ext_space_root(void)
+{
+   int size;
+
+   size = EXT2_N_BLOCKS*sizeof(blk_t);
+   size -= sizeof(struct ext3_extent_header);
+   size /= sizeof(struct ext3_extent);
+   return size;
+}
+
+static int ext3_ext_space_root_idx(void)
+{
+   int size;
+
+   size = EXT2_N_BLOCKS*sizeof(blk_t);
+   size -= sizeof(struct ext3_extent_header);
+   size /= sizeof(struct ext3_extent_idx);
+   return size;
+}
+static void ext3_ext_binsearch_idx(struct ext4_ext_path *path, blk_t 
logical_blk)
+{
+   struct ext3_extent_header *eh = path-p_hdr;
+   struct ext3_extent_idx *r, *l, *m;
+
+   l = EXT_FIRST_INDEX(eh) + 1;
+   r = EXT_FIRST_INDEX(eh) + eh-eh_entries - 1;
+   while (l = r) {
+   m = l + (r - l) / 2;
+   if (logical_blk  m-ei_block)
+   r = m - 1;
+   else
+   l = m + 1;
+   }
+   path-p_idx = l - 1;
+}
+
+static void ext3_ext_binsearch(struct ext4_ext_path *path, blk_t logical_blk)
+{
+   struct ext3_extent_header *eh = path-p_hdr;
+   struct ext3_extent *r, *l, *m;
+
+   if (eh-eh_entries == 0) {
+   /*
+* this leaf is empty:
+* we get such a leaf in split/add case
+*/
+   return;
+   }
+
+   l = EXT_FIRST_EXTENT(eh) + 1;
+   r = EXT_FIRST_EXTENT(eh) + eh-eh_entries - 1;
+
+   while (l = r) {
+   m = l + (r - l) / 2;
+   if (logical_blk  m-ee_block)
+   r = m - 1;
+   else
+   l = m + 1;
+   }
+
+   path-p_ext = l - 1;
+}
+
+/*
+ * ext3_ext_insert_index:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ */

[PATCH 2/2] e2fsprogs: Add ext4migrate

2007-04-03 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V [EMAIL PROTECTED]

Add the ext4migrate utility, which helps in migrating an ext3 block-mapped
inode to an ext4 extent-mapped inode.

ext4migrate command takes the below syntax
ext4migrate [--display | --migrate ] image_name filename

The --display option displays the block map details for an ext3 inode
and the extent map details for an ext4 inode. The --migrate option converts a
block-mapped ext3 inode to an extent-mapped ext4 inode.

This needs to be run on an unmounted file system (offline migration).

The inode modification is done only at the last stage. This is to make sure 
that if we
fail at any intermediate stage, we exit without touching the disk.

Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
---
 Makefile.in |3 +-
 configure.in|   21 ++-
 ext4migrate/Makefile.in |   66 +++
 ext4migrate/migrate.c   |  491 +++
 ext4migrate/migrate.h   |   27 +++
 5 files changed, 606 insertions(+), 2 deletions(-)
 create mode 100644 ext4migrate/Makefile.in
 create mode 100644 ext4migrate/migrate.c
 create mode 100644 ext4migrate/migrate.h

diff --git a/Makefile.in b/Makefile.in
index 0d31caa..9d8d291 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -9,9 +9,10 @@ INSTALL = @INSTALL@
 
 @[EMAIL PROTECTED] resize
 @[EMAIL PROTECTED] debugfs
[EMAIL PROTECTED]@EXT4MIGRATE_DIR= ext4migrate
 
 LIB_SUBDIRS=lib/et lib/ss lib/e2p lib/ext2fs lib/uuid lib/blkid intl
-PROG_SUBDIRS=e2fsck $(DEBUGFS_DIR) misc $(RESIZE_DIR) tests/progs po
+PROG_SUBDIRS=e2fsck $(DEBUGFS_DIR) misc $(RESIZE_DIR) tests/progs po 
$(EXT4MIGRATE_DIR)
 SUBDIRS=util $(LIB_SUBDIRS) $(PROG_SUBDIRS) tests
 
 SUBS= lib/ext2fs/ext2_types.h lib/blkid/blkid_types.h lib/uuid/uuid_types.h
diff --git a/configure.in b/configure.in
index 44b718d..5ae5e0b 100644
--- a/configure.in
+++ b/configure.in
@@ -436,6 +436,24 @@ echo Building e2fsck statically by default
 )
 AC_SUBST(E2FSCK_TYPE)
 dnl
+dnl handle --enable-ext4migrate
+dnl
+AC_ARG_ENABLE([ext4migrate],
+[  --disable-ext4migrate disable support of ext4migrate program],
+if test $enableval = no
+then
+   echo Disabling ext4migrate support
+   EXT4MIGRATE_CMT=#
+else
+   EXT4MIGRATE_CMT=
+   echo Enabling ext4migrate support
+fi
+,
+echo Enabling ext4migrate support by default
+EXT4MIGRATE_CMT=
+)
+AC_SUBST(EXT4MIGRATE_CMT)
+dnl
 dnl See whether to install the `fsck' wrapper program (that calls e2fsck)
 dnl
 AC_ARG_ENABLE([fsck],
@@ -862,7 +880,8 @@ for i in MCONFIG Makefile e2fsprogs.spec \
lib/e2p/e2p.pc lib/blkid/blkid.pc lib/ext2fs/ext2fs.pc \
misc/Makefile ext2ed/Makefile e2fsck/Makefile \
debugfs/Makefile tests/Makefile tests/progs/Makefile \
-   resize/Makefile doc/Makefile intl/Makefile po/Makefile.in ; do
+   resize/Makefile doc/Makefile intl/Makefile po/Makefile.in \
+   ext4migrate/Makefile ; do
if test -d `dirname ${srcdir}/$i` ; then
outlist=$outlist $i
fi
diff --git a/ext4migrate/Makefile.in b/ext4migrate/Makefile.in
new file mode 100644
index 000..2596508
--- /dev/null
+++ b/ext4migrate/Makefile.in
@@ -0,0 +1,66 @@
+#
+# Standard e2fsprogs prologue
+#
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+top_builddir = ..
+my_dir = util
+INSTALL = @INSTALL@
[EMAIL PROTECTED]@
+
+SRCS = $(srcdir)/migrate.c
+
+LIBS= $(LIBEXT2FS)  $(LIBCOM_ERR) 
+DEPLIBS= $(LIBEXT2FS)  $(LIBCOM_ERR) 
+
+.c.o:
+   @echo  CC $
+   @$(CC) -c $(ALL_CFLAGS) $ -o $@
+   @#cc -g -I../lib/  -Wunreachable-code -Wunused -Wunused-function 
+   @#-Wunused-label  -Wunused-parameter -Wunused-value  -Wunused-variable  
-c migrate.c
+
+PROGS= ext4migrate
+
+all:: $(PROGS)
+
+ext4migrate: migrate.o  extents.o $(DEPLIBS)
+   @echo  LD $@
+   @$(CC) $(ALL_LDFLAGS) -o ext4migrate migrate.o extents.o $(LIBS)
+
+installdirs:
+   @echo  MKINSTALLDIRS $(root_sbindir)
+   @$(MKINSTALLDIRS) $(DESTDIR)$(root_sbindir) 
+
+install: $(PROGS) installdirs
+   @for i in $(PROGS); do \
+   echo   INSTALL $(root_sbindir)/$$i; \
+   $(INSTALL_PROGRAM) $$i $(DESTDIR)$(root_sbindir)/$$i; \
+   done
+
+install-strip: install
+   @for i in $(PROGS); do \
+   echo   STRIP $(root_sbindir)/$$i; \
+   $(STRIP) $(DESTDIR)$(root_sbindir)/$$i; \
+   done
+
+uninstall:
+   for i in $(PROGS); do \
+   $(RM) -f $(DESTDIR)$(root_sbindir)/$$i; \
+   done
+clean:
+   $(RM) -f $(PROGS) \#* *.s *.o *.a *~ core
+
+mostlyclean: clean
+
+distclean: clean
+   $(RM) -f .depend Makefile $(srcdir)/TAGS $(srcdir)/Makefile.in.old
+
+# +++ Dependency line eater +++
+#
+# Makefile dependencies follow.  This must be the last section in
+# the Makefile.in file
+#
+migrate.o: $(srcdir)/migrate.c
+extents.o: $(srcdir)/extents.c
diff --git a/ext4migrate/migrate.c b/ext4migrate/migrate.c
new file mode 100644
index 000..0d851cc
--- /dev/null
+++ 

Re: tune2fs -l stale info

2007-04-03 Thread Vincent Caron
On jeu, 2007-03-29 at 13:59 -0600, Andreas Dilger wrote:
 On Mar 29, 2007  14:17 +0200, Vincent Caron wrote:
I just noticed that 'tune2fs -l' did not return live, up-to-date
  information regarding the free inode count (it looks like it's always
  correct after unmounting).
 
 This is a bit of a defect in all 2.6 kernels.  They never update the
 on disk superblock free blocks/inodes information to avoid lock contention,
 even if this info is available.

  It turns out it is okay in my case since 'df -i' reports correct
numbers.

 Can you please give the following patch a try?  It fixes this issue,
 and also makes statfs MUCH more efficient for large filesystems, because
 the filesystem overhead is constant unless the filesystem size changes
 and checking that for 16k groups is slow (hence hack to add cond_resched()
 instead of fixing problem correctly).  It has not been tested much, but
 is very straight forward.
 
 Only the last part is strictly necessary to fix your particular problem
 (setting of es-s_free_inodes_count and es-s_free_blocks_count).  This
 is lazy, in the sense that you need a statfs to update the count, and
 then a truncate or unlink or rmdir in order to dirty the superblock to
 flush it to disk.  However, it will be correct in the buffer cache, and
 it is a lot better than what we have now.  We don't want a non-lazy version
 anyways, because of performance.

  Unfortunately the problem shows on a production machine and I don't
have a similar one to test properly (it's a heavy-loaded filer).

  BTW, if on-disk superblocks are not updated until specific events occur
(umount, statfs), what is the consequence of a system crash? Does
journaling come into play (superblock=metadata?), or does fsck fix the
figures from other on-disk structures? Just being curious...

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Correction to check_filetype()

2007-04-03 Thread Andreas Dilger
On Mar 31, 2007  10:39 -0400, Theodore Tso wrote:
 I'm going to let this one soak for a bit to make sure we don't pick up
 any false positives or negatives in the heuristics.
 
 @@ -133,11 +133,10 @@ int e2fsck_pass1_check_device_inode(ext2
 +  * If the index flag is set, then this is a bogus
 +  * device/fifo/socket
*/
 - if ((ext2fs_inode_data_blocks(fs, inode) != 0) ||
 - (inode-i_flags  EXT2_INDEX_FL))
 + if (inode-i_flags  EXT2_INDEX_FL)
   return 0;

There were ancient versions of the kernel that left EXT2_INDEX_FL set
on all files, instead of just directories...  I'm not sure if those
were in actual released kernels, or just in patches.

 +static void check_is_really_dir(e2fsck_t ctx, struct problem_context *pctx,
 + char *buf)
 +{
 + if (ext2fs_read_dir_block(ctx-fs, inode-i_block[0], buf))
 + return;

Do we call check_blocks() on this inode shortly thereafter?  If we do then
the overhead of reading the first block is offset by not reading it again
later.  Otherwise, this could slow things down.

 + dirent = (struct ext2_dir_entry *) buf;
 + if (((dirent-name_len  0xFF) != 1) ||
 + (dirent-name[0] != '.') ||
 + (dirent-inode != pctx-ino) ||
 + (dirent-rec_len  12) ||
 + (dirent-rec_len % 4) ||
 + (dirent-rec_len = ctx-fs-blocksize - 12))
 + return;
 +
 + dirent = (struct ext2_dir_entry *) (buf + dirent-rec_len);
 + if (((dirent-name_len  0xFF) != 2) ||
 + (dirent-name[0] != '.') ||
 + (dirent-name[1] != '.') ||
 + (dirent-rec_len  12) ||
 + (dirent-rec_len % 4))
 + return;
 +
 + if (fix_problem(ctx, PR_1_TREAT_AS_DIRECTORY, pctx)) {
 + inode-i_mode = (inode-i_mode  0) | LINUX_S_IFDIR;
 + e2fsck_write_inode_full(ctx, pctx-ino, inode, 
 + EXT2_INODE_SIZE(ctx-fs-super), 
 + check_is_really_dir);
   }

The one worry I have here (though I don't think it is necessarily IN
the code you propose) is that someone could create a regular file which
looks like a directory and somehow get it linked into the filesystem
tree, giving them escalated access (e.g. device files owned by them,
suid executables, links to otherwise unreadable files, etc).

It would seem that this is only a danger if the mode on the file is
corrupted, which shouldn't really be doable by a regular user, but
it is definitely something to consider.

I take it that this code fixes the test image I previously posted?

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Correction to check_filetype()

2007-04-03 Thread Theodore Tso
On Tue, Apr 03, 2007 at 11:37:16AM -0600, Andreas Dilger wrote:
 On Mar 31, 2007  10:39 -0400, Theodore Tso wrote:
  I'm going to let this one soak for a bit to make sure we don't pick up
  any false positives or negatives in the heuristics.
  
  @@ -133,11 +133,10 @@ int e2fsck_pass1_check_device_inode(ext2
  +* If the index flag is set, then this is a bogus
  +* device/fifo/socket
   */
  -   if ((ext2fs_inode_data_blocks(fs, inode) != 0) ||
  -   (inode-i_flags  EXT2_INDEX_FL))
  +   if (inode-i_flags  EXT2_INDEX_FL)
  return 0;
 
 There were ancient versions of the kernel that left EXT2_INDEX_FL set
 on all files, instead of just directories...  I'm not sure if those
 were in actual released kernels, or just in patches.

Well, we've been running with this in e2fsprogs for quite some time
(August 2002, e2fsprogs 1.28), and no one has complained, so I think
we're safe...


  +static void check_is_really_dir(e2fsck_t ctx, struct problem_context *pctx,
  +   char *buf)
  +{
  +   if (ext2fs_read_dir_block(ctx-fs, inode-i_block[0], buf))
  +   return;
 
 Do we call check_blocks() on this inode shortly thereafter?  If we do then
 the overhead of reading the first block is offset by not reading it again
 later.  Otherwise, this could slow things down.

This is why we only do this on special devices; check_is_really_dir()
doesn't do anything on directory or regular files.

 The one worry I have here (though I don't think it is necessarily IN
 the code you propose) is that someone could create a regular file which
 looks like a directory and somehow get it linked into the filesystem
 tree, giving them escalated access (e.g. device files owned by them,
 suid executables, links to otherwise unreadable files, etc).

I thought of that, but I didn't worry about it because I'm not doing
this check on regular files.  Come to think of it I should add a check
so we don't do this for long symlinks, for similar reasons.  It would
only matter if the user can force filesystem check, which makes it a
long shot, but we should avoid that issue as well.

 I take it that this code fixes the test image I previously posted?

Yup.  Take a look at what I checked into Mercurial.  I added two
separate test cases.  One of them directly tests the filetype issue,
and the other tests the case of a directory mutated into a char device.

 - Ted
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][take 2] e2fsprogs: Add ext4migrate

2007-04-03 Thread Andreas Dilger
On Apr 03, 2007  15:37 +0530, Aneesh Kumar K.V wrote:
 The extent insert code is derived out of the latest ext4 kernel
 source. I have tried to keep the code as close as possible to the
 kernel sources. This makes sure that any fixes for the tree building
 code in kernel should be easily applied to ext4migrate.  The ext3_ext
 naming convention instead of ext4_ext found in kernel is to make sure
 we are in sync with rest of e2fsprogs source.

Of course, the other way to do this would be to temporarily mount the
filesystem as ext4, copy non-extent files via cp (can use lsattr to
check for extent flag) and then rename new file over old one.  Care
must be taken to not mount filesystem on visible mountpoint, so that
users cannot be changing the filesystem while copy is being done.

This can be done to convert an ext4 filesystem back to ext3 also, if
the ext4 filesystem is mounted with noextents (to disable creation
of new files with extent mapping).

The only minor issue is that the inode numbers of the files will change.

 The inode modification is done only at the last stage. This is to make
 sure that if we fail at any intermediate stage, we exit without touching
 the disk.
 
 The inode update is done as below
 a) Walk the extent index blocks and write them to the disk. If failed exit
 b) Write the inode. if failed exit.
 c) Write the updated block bitmap. If failed, exit. (This could be a problem
because we have already updated the inode i_block field to point to new
blocks.) But such inconsistency between inode i_block and block bitmap
can be fixed by fsck IIUC.

Why not mark all the relevant blocks in use (for both extent- and block-mapped
copies) until the copy is done, then write everything out, and only mark the
block-mapped file blocks free after the inode is written to disk?  This avoids
the danger that the new extent-mapped file's blocks are marked free and get
double-allocated (corrupting the file data, possibly the whole filesystem).

I don't think there is a guarantee that an impatient user will run a lengthy
e2fsck after interrupting the migrate.  Also, you should mark the filesystem
unclean at first change unless everything completes successfully.  That way
e2fsck will at least run automatically on the next boot.


Other general notes:
- wrap lines at 80 columns
- would be good to have a -R mode that walked the whole filesystem,
  since startup time is very long for large filesystems
- also allow specifying multiple files on the command-line
- changing the operation to be multi-file allows avoiding sync of bitmaps
  two times (once after extents are allocated and inode written, once after
  indirect blocks are freed).  There only needs to be one sync per file.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC: 2.6.16 patch] jbd: journal_dirty_data re-check for unmapped buffers

2007-04-03 Thread Adrian Bunk
This patch also seems to make sense for 2.6.16, or am I missing anything?

TIA
Adrian


commit f58a74dca88d48b0669609b4957f3dd757bdc898
Author: Eric Sandeen [EMAIL PROTECTED]
Date:   Sat Oct 28 10:38:27 2006 -0700

[PATCH] jbd: journal_dirty_data re-check for unmapped buffers

When running several fsx's and other filesystem stress tests, we found
cases where an unmapped buffer was still being sent to submit_bh by the
ext3 dirty data journaling code.

I saw this happen in two ways, both related to another thread doing a
truncate which would unmap the buffer in question.

Either we would get into journal_dirty_data with a bh which was already
unmapped (although journal_dirty_data_fn had checked for this earlier, the
state was not locked at that point), or it would get unmapped in the middle
of journal_dirty_data when we dropped locks to call sync_dirty_buffer.

By re-checking for mapped state after we've acquired the bh state lock, we
should avoid these races.  If we find a buffer which is no longer mapped,
we essentially ignore it, because journal_unmap_buffer has already decided
that this buffer can go away.

I've also added tracepoints in these two cases, and made a couple other
tracepoint changes that I found useful in debugging this.

Signed-off-by: Eric Sandeen [EMAIL PROTECTED]
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Andrew Morton [EMAIL PROTECTED]
Signed-off-by: Linus Torvalds [EMAIL PROTECTED]

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d5c6304..4f82bcd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -967,6 +967,13 @@ int journal_dirty_data(handle_t *handle, struct 
buffer_head *bh)
 */
jbd_lock_bh_state(bh);
spin_lock(journal-j_list_lock);
+
+   /* Now that we have bh_state locked, are we really still mapped? */
+   if (!buffer_mapped(bh)) {
+   JBUFFER_TRACE(jh, unmapped buffer, bailing out);
+   goto no_journal;
+   }
+
if (jh-b_transaction) {
JBUFFER_TRACE(jh, has transaction);
if (jh-b_transaction != handle-h_transaction) {
@@ -1028,6 +1035,11 @@ int journal_dirty_data(handle_t *handle, struct 
buffer_head *bh)
sync_dirty_buffer(bh);
jbd_lock_bh_state(bh);
spin_lock(journal-j_list_lock);
+   /* Since we dropped the lock... */
+   if (!buffer_mapped(bh)) {
+   JBUFFER_TRACE(jh, buffer got 
unmapped);
+   goto no_journal;
+   }
/* The buffer may become locked again at any
   time if it is redirtied */
}
@@ -1824,6 +1836,7 @@ static int journal_unmap_buffer(journal_t *journal, 
struct buffer_head *bh)
}
}
} else if (transaction == journal-j_committing_transaction) {
+   JBUFFER_TRACE(jh, on committing transaction);
if (jh-b_jlist == BJ_Locked) {
/*
 * The buffer is on the committing transaction's locked
@@ -1838,7 +1851,6 @@ static int journal_unmap_buffer(journal_t *journal, 
struct buffer_head *bh)
 * can remove it's next_transaction pointer from the
 * running transaction if that is set, but nothing
 * else. */
-   JBUFFER_TRACE(jh, on committing transaction);
set_buffer_freed(bh);
if (jh-b_next_transaction) {
J_ASSERT(jh-b_next_transaction ==
@@ -1858,6 +1870,7 @@ static int journal_unmap_buffer(journal_t *journal, 
struct buffer_head *bh)
 * i_size already for this truncate so recovery will not
 * expose the disk blocks we are discarding here.) */
J_ASSERT_JH(jh, transaction == journal-j_running_transaction);
+   JBUFFER_TRACE(jh, on running transaction);
may_free = __dispose_buffer(jh, transaction);
}
 
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: tune2fs -l stale info

2007-04-03 Thread Andreas Dilger
On Apr 03, 2007  15:55 +0200, Vincent Caron wrote:
   BTW, if on-disk superblocks are not updated until specific events occur
 (umount, statfs), what is the consequence of a system crash? Does
 journaling come into play (superblock=metadata?), or does fsck fix the
 figures from other on-disk structures? Just being curious...

The free blocks/inodes count in the superblock aren't really used for
anything these days.  At mount time the kernel sums up the free inode
and block counts from the group descriptors to put into the percpu
counters, and only the group descriptors are kept uptodate.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html