date:20070208

[RFC][PATCH 1/3] Allocate new contiguous blocks

2007-02-08 Thread Takashi Sato

Search contiguous free blocks with Alex's multi-block allocation
and allocate them for the temporary inode.

This patch applies on top of Alex's patches.
[RFC] delayed allocation, mballoc, etc
http://marc.theaimsgroup.com/?l=linux-ext4&m=116493228301966&w=2

Signed-off-by: Takashi Sato [EMAIL PROTECTED]
---
diff -Nrup -X linux-2.6.19-rc6.org/Documentation/dontdiff 
linux-2.6.19-rc6.org/fs/ext4/extents.c linux-2.6.19-rc6-1/fs/ext4/extents.c
--- linux-2.6.19-rc6.org/fs/ext4/extents.c  2007-02-08 08:40:48.0 
+0900
+++ linux-2.6.19-rc6-1/fs/ext4/extents.c2007-02-08 14:13:49.0 
+0900
@@ -2335,10 +2335,658 @@ int ext4_ext_calc_metadata_amount(struct
return num;
 }
 
+/*
+ * this structure is used to gather extents from the tree via ioctl
+ */
+struct ext4_extent_buf {
+   ext4_fsblk_t start;
+   int buflen;
+   void *buffer;
+   void *cur;
+   int err;
+};
+
+/*
+ * this structure is used to collect stats info about the tree
+ */
+struct ext4_extent_tree_stats {
+   int depth;
+   int extents_num;
+   int leaf_num;
+};
+
+static int
+ext4_ext_store_extent_cb(struct inode *inode,
+   struct ext4_ext_path *path,
+   struct ext4_ext_cache *newex,
+   struct ext4_extent_buf *buf)
+{
+
+   if (newex-ec_type != EXT4_EXT_CACHE_EXTENT)
+   return EXT_CONTINUE;
+
+   if (buf-err  0)
+   return EXT_BREAK;
+   if (buf-cur - buf-buffer + sizeof(*newex)  buf-buflen)
+   return EXT_BREAK;
+
+   if (!copy_to_user(buf-cur, newex, sizeof(*newex))) {
+   buf-err++;
+   buf-cur += sizeof(*newex);
+   } else {
+   buf-err = -EFAULT;
+   return EXT_BREAK;
+   }
+   return EXT_CONTINUE;
+}
+
+static int
+ext4_ext_collect_stats_cb(struct inode *inode,
+   struct ext4_ext_path *path,
+   struct ext4_ext_cache *ex,
+   struct ext4_extent_tree_stats *buf)
+{
+   int depth;
+
+   if (ex-ec_type != EXT4_EXT_CACHE_EXTENT)
+   return EXT_CONTINUE;
+
+   depth = ext_depth(inode);
+   buf-extents_num++;
+   if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
+   buf-leaf_num++;
+   return EXT_CONTINUE;
+}
+
+int ext4_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+   unsigned long arg)
+{
+   int err = 0;
+   if (!(EXT4_I(inode)-i_flags  EXT4_EXTENTS_FL))
+   return -EINVAL;
+
+   if (cmd == EXT4_IOC_GET_EXTENTS) {
+   struct ext4_extent_buf buf;
+
+   if (copy_from_user(buf, (void *) arg, sizeof(buf)))
+   return -EFAULT;
+
+   buf.cur = buf.buffer;
+   buf.err = 0;
+   mutex_lock(EXT4_I(inode)-truncate_mutex);
+   err = ext4_ext_walk_space(inode, buf.start, EXT_MAX_BLOCK,
+   (void *)ext4_ext_store_extent_cb, buf);
+   mutex_unlock(EXT4_I(inode)-truncate_mutex);
+   if (err == 0)
+   err = buf.err;
+   } else if (cmd == EXT4_IOC_GET_TREE_STATS) {
+   struct ext4_extent_tree_stats buf;
+
+   mutex_lock(EXT4_I(inode)-truncate_mutex);
+   buf.depth = ext_depth(inode);
+   buf.extents_num = 0;
+   buf.leaf_num = 0;
+   err = ext4_ext_walk_space(inode, 0, EXT_MAX_BLOCK,
+   (void *)ext4_ext_collect_stats_cb, buf);
+   mutex_unlock(EXT4_I(inode)-truncate_mutex);
+   if (!err)
+   err = copy_to_user((void *) arg, buf, sizeof(buf));
+   } else if (cmd == EXT4_IOC_GET_TREE_DEPTH) {
+   mutex_lock(EXT4_I(inode)-truncate_mutex);
+   err = ext_depth(inode);
+   mutex_unlock(EXT4_I(inode)-truncate_mutex);
+   } else if (cmd == EXT4_IOC_FIBMAP) {
+   ext4_fsblk_t __user *p = (ext4_fsblk_t __user *)arg;
+   ext4_fsblk_t block = 0;
+   struct address_space *mapping = filp-f_mapping;
+
+   if (copy_from_user(block, (ext4_fsblk_t __user *)arg,
+   sizeof(block)))
+   return -EFAULT;
+
+   lock_kernel();
+   block = ext4_bmap(mapping, block);
+   unlock_kernel();
+
+   return put_user(block, p);
+   } else if (cmd == EXT4_IOC_DEFRAG) {
+   struct ext4_ext_defrag_data defrag;
+
+   if (copy_from_user(defrag,
+   (struct ext4_ext_defrag_data __user *)arg,
+   sizeof(defrag)))
+   return -EFAULT;
+
+   err = ext4_ext_defrag(filp, defrag.start_offset,
+   defrag.defrag_size, defrag.goal);
+

[RFC][PATCH 2/3] Move the file data to the new blocks

2007-02-08 Thread Takashi Sato

Move the blocks on the temporary inode to the original inode,
one page at a time.
1. Read the file data from the old blocks to the page
2. Move the block on the temporary inode to the original inode
3. Write the file data on the page into the new blocks

Signed-off-by: Takashi Sato [EMAIL PROTECTED]
---
diff -Nrup -X linux-2.6.19-rc6.org/Documentation/dontdiff 
linux-2.6.19-rc6-1/fs/ext4/extents.c linux-2.6.19-rc6-full/fs/ext4/extents.c
--- linux-2.6.19-rc6-1/fs/ext4/extents.c2007-02-08 14:13:49.0 
+0900
+++ linux-2.6.19-rc6-full/fs/ext4/extents.c 2007-02-08 14:09:43.0 
+0900
@@ -2533,6 +2533,653 @@ ext4_ext_next_extent(struct inode *inode
 }
 
 /**
+ * ext4_ext_merge_across_blocks - merge extents across leaf blocks
+ * 
+ * @handle     journal handle
+ * @inode      target file's inode
+ * @o_start    first original extent to be defragmented
+ * @o_end      last original extent to be defragmented
+ * @start_ext  first new extent to be merged
+ * @new_ext    middle of new extent to be merged
+ * @end_ext    last new extent to be merged
+ *
+ * This function returns 0 on success, otherwise an error value.
+ */
+static int
+ext4_ext_merge_across_blocks(handle_t *handle, struct inode *inode,
+   struct ext4_extent *o_start,
+   struct ext4_extent *o_end, struct ext4_extent *start_ext,
+   struct ext4_extent *new_ext,struct ext4_extent *end_ext)
+{
+   struct ext4_ext_path *org_path = NULL;
+   unsigned long eblock = 0;
+   int err = 0;
+   int new_flag = 0;
+   int end_flag = 0;
+
+   if (le16_to_cpu(start_ext-ee_len) 
+   le16_to_cpu(new_ext-ee_len) 
+   le16_to_cpu(end_ext-ee_len)) {
+
+   if ((o_start) == (o_end)) {
+
+   /*   start_ext   new_extend_ext
+* dest |-|---||
+* org  |--|
+*/
+
+   ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+le16_to_cpu(start_ext-ee_len),
+le16_to_cpu(new_ext-ee_len), 0);
+
+   end_flag = 1;
+
+   } else {
+
+   /*   start_ext   new_ext   end_ext
+* dest |-|--|-|
+* org  |---|--|
+*/
+
+   ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+   le16_to_cpu(start_ext-ee_len),
+   le16_to_cpu(o_start-ee_len)
+   - le16_to_cpu(start_ext-ee_len), 0);
+
+   ext4_free_blocks(handle, inode, ext_pblock(o_end),
+   le16_to_cpu(o_end-ee_len)
+   - le16_to_cpu(end_ext-ee_len), 0);
+
+   o_end-ee_block = end_ext-ee_block;
+   o_end-ee_len = end_ext-ee_len;
+   ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+   }
+
+   o_start-ee_len = start_ext-ee_len;
+   new_flag = 1;
+
+   } else if ((le16_to_cpu(start_ext-ee_len)) 
+   (le16_to_cpu(new_ext-ee_len)) 
+   (!le16_to_cpu(end_ext-ee_len)) 
+   ((o_start) == (o_end))) {
+
+   /* start_extnew_ext
+* dest |--|---|
+* org  |--|
+*/
+
+   ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+   le16_to_cpu(start_ext-ee_len),
+   le16_to_cpu(new_ext-ee_len), 0);
+
+   o_start-ee_len = start_ext-ee_len;
+   new_flag = 1;
+
+   } else if ((!le16_to_cpu(start_ext-ee_len)) 
+   (le16_to_cpu(new_ext-ee_len)) 
+   (le16_to_cpu(end_ext-ee_len)) 
+   ((o_start) == (o_end))) {
+
+   /*  new_extend_ext
+* dest |--|---|
+* org  |--|
+*/
+
+   ext4_free_blocks(handle, inode, ext_pblock(o_end),
+   le16_to_cpu(new_ext-ee_len), 0);
+
+   o_end-ee_block = end_ext-ee_block;
+   o_end-ee_len = end_ext-ee_len;
+   ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+   /* If new_ext was first block */
+   if (!new_ext-ee_block)
+   eblock = 0;
+   else
+   eblock = le32_to_cpu(new_ext-ee_block);
+
+   new_flag = 1;
+   } else {
+   printk(Unexpected case \n);
+   return -EIO;
+   }
+
+   if (new_flag) {
+

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

2007-02-08 Thread Andrew Morton

On Thu, 8 Feb 2007 10:29:45 +0100 Jan Kara [EMAIL PROTECTED] wrote:

 On Wed 07-02-07 12:56:59, Andrew Morton wrote:
  On Wed, 7 Feb 2007 13:46:57 -0700
  Andreas Dilger [EMAIL PROTECTED] wrote:
  
   On Feb 06, 2007  17:35 -0800, Andrew Morton wrote:
On Mon, 5 Feb 2007 14:12:04 +0100
Jan Kara [EMAIL PROTECTED] wrote:
  Move the blocks on the temporary inode to the original inode
  by a page.
  1. Read the file data from the old blocks to the page
  2. Move the block on the temporary inode to the original inode
  3. Write the file data on the page into the new blocks
   I have one thing - it's probably not good to use page cache for
 defragmentation.

Then it is no longer online defragmentation.  The issues with 
maintaining
correctness and coherency with ongoing VFS activity would be truly 
ghastly.

If we're worried about pagecache pollution then it would be better to 
control
that from userspace via fadvise().
   
   It should be possible to have the online defrag tool lock the inode 
   against
   any changes,
  
  Sounds easy when you say it fast.  But how do we lock against, say, a
  read pagefault?  Only by writing back then removing the pagecache page then
  reinstantiating it as a locked, not-uptodate page and then removing it from
  pagecache afterwards prior to unlocking it.  Or something.
  
  I don't think we want to go there.
   I thought Andreas meant any write changes - i.e. you check that no one
 has open file descriptor for writing and block any new open for writing.
 That can be done quite easily.
   Anyway, I agree with you that userspace solution to a possible page
 cache pollution is preferable after thinking about it for a while.
 As I've been thinking about it, we could actually do the copying
 from user space. We could do something like:
   block any writes to file (as I described above)
   craft new inode with blocks allocated as we want (using preallocation,
 we should mostly have the kernel infrastructure we need)
   copy data using splice syscall
   call the kernel to switch data
 

I don't think we need to block any writes to any file or anything.

To move a page within a file:

fd = open(file);
p = mmap(fd);
the_page_was_in_core = mincore(p, offset);
munmap(p);
ioctl(fd, ..., new_block);

kernel
read_cache_page(inode, offset);
lock_page(page);
if (try_to_free_buffers(page)) {
relocate the page
set_page_dirty(page);
}
unlock_page(page);

if (the_page_was_in_core) {
sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
SYNC_FILE_RANGE_WRITE|
SYNC_FILE_RANGE_WAIT_AFTER);
fadvise(fd, offset, FADV_DONTNEED);
}

completely coherent with pagecache, quite safe in the presence of mmap,
mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

2007-02-08 Thread Jan Kara

On Thu 08-02-07 01:45:29, Andrew Morton wrote:
 snip
I thought Andreas meant any write changes - i.e. you check that no one
  has open file descriptor for writing and block any new open for writing.
  That can be done quite easily.
Anyway, I agree with you that userspace solution to a possible page
  cache pollution is preferable after thinking about it for a while.
  As I've been thinking about it, we could actually do the copying
  from user space. We could do something like:
block any writes to file (as I described above)
craft new inode with blocks allocated as we want (using preallocation,
  we should mostly have the kernel infrastructure we need)
copy data using splice syscall
call the kernel to switch data
  
 
 I don't think we need to block any writes to any file or anything.
 
 To move a page within a file:
 
   fd = open(file);
   p = mmap(fd);
   the_page_was_in_core = mincore(p, offset);
   munmap(p);
   ioctl(fd, ..., new_block);
 
   kernel
   read_cache_page(inode, offset);
   lock_page(page);
   if (try_to_free_buffers(page)) {
   relocate the page
   set_page_dirty(page);
   }
   unlock_page(page);
 
   if (the_page_was_in_core) {
   sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
   SYNC_FILE_RANGE_WRITE|
   SYNC_FILE_RANGE_WAIT_AFTER);
   fadvise(fd, offset, FADV_DONTNEED);
   }
 
 completely coherent with pagecache, quite safe in the presence of mmap,
 mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
  Yes, this is the simple way. But I see two disadvantages:
1) You'd like to relocate metadata (indirect blocks) too. For that you need
   a different mechanism. In my approach, you can mostly assume you've got
   sanely laid out metadata and so the existence of such mechanism is not
   so important.
2) You'd like to allocate new blocks in big chunks. So your kernel function
   should rather take a range. Also when you fail in the middle of
   relocating a file (for example the block you'd like to use is already
   taken by someone else), I find it nice if you can return at least to the
   original state. But that's probably not important.

Honza

-- 
Jan Kara [EMAIL PROTECTED]
SuSE CR Labs
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC] [PATCH 1/1] Nanosecond timestamps

2007-02-08 Thread Johann Lombardi

On Wed, Feb 07, 2007 at 01:39:46PM -0700, Andreas Dilger wrote:
 This has been a bug in several places already, and I wonder if the
 le*_to_cpu() and cpu_to_le*() macros shouldn't do some type checking
 instead of just casting the variable to the specified type?

That would be great.

 The only problem is if casting constants it would be a bit of a pain
 to have to cast them explicitly, though we could have something like:
 
 #define le16_to_cpu(var) (__builtin_constant(var) || !typecheck(__u16, var) ? \
 __constant_cpu_to_le16(var) : __le16_to_cpu(var))

Very good idea!

 The only question is whether typecheck adds extra variables on the stack
 or if the compiler will always optimize them away.

I tend to think it will always be optimized by the compiler.
 
  If the inode size is EXT3_GOOD_OLD_INODE_SIZE, sbi->s_want_extra_isize won't
  be initialized. However, it should not be an issue because the ext3_sb_info
  is set to zero in ext3_fill_super().
 
 So I'm not sure I understand if you have an objection or if this is just a
 comment.

Just a useless comment :)

 sbi->s_want_extra_isize will be zero and it is not possible for
 sbi->s_inode_size  EXT3_GOOD_OLD_INODE_SIZE so this case won't be hit.

I agree.

Cheers,
Johann
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC] [PATCH 1/1] Nanosecond timestamps

2007-02-08 Thread Johann Lombardi

On Wed, Feb 07, 2007 at 03:05:39PM -0600, Dave Kleikamp wrote:
 On Wed, 2007-02-07 at 13:39 -0700, Andreas Dilger wrote:
  You are right - this works fine on little endian systems, but fails on
  big endian systems where you will get the other half of the word.
  
  This has been a bug in several places already, and I wonder if the
  le*_to_cpu() and cpu_to_le*() macros shouldn't do some type checking
  instead of just casting the variable to the specified type?
 
 I think that sparse will catch this.  To get the endian checks you need
 to do something like this:
 
 make C=2 CF='-D__CHECK_ENDIAN__'

Indeed:

  CHECK   fs/ext3/super.c
  fs/ext3/super.c:1787:8: warning: cast to restricted type
  fs/ext3/super.c:1789:6: warning: cast to restricted type
  fs/ext3/super.c:1791:8: warning: cast to restricted type
  fs/ext3/super.c:1793:6: warning: cast to restricted type

Thanks,
Johann
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

2007-02-08 Thread Andrew Morton

On Thu, 8 Feb 2007 11:21:02 +0100 Jan Kara [EMAIL PROTECTED] wrote:

 On Thu 08-02-07 01:45:29, Andrew Morton wrote:
  snip
 I thought Andreas meant any write changes - i.e. you check that no one
   has open file descriptor for writing and block any new open for writing.
   That can be done quite easily.
 Anyway, I agree with you that userspace solution to a possible page
   cache pollution is preferable after thinking about it for a while.
   As I've been thinking about it, we could actually do the copying
   from user space. We could do something like:
 block any writes to file (as I described above)
 craft new inode with blocks allocated as we want (using preallocation,
   we should mostly have the kernel infrastructure we need)
 copy data using splice syscall
 call the kernel to switch data
   
  
  I don't think we need to block any writes to any file or anything.
  
  To move a page within a file:
  
  fd = open(file);
  p = mmap(fd);
  the_page_was_in_core = mincore(p, offset);
  munmap(p);
  ioctl(fd, ..., new_block);
  
  kernel
  read_cache_page(inode, offset);
  lock_page(page);
  if (try_to_free_buffers(page)) {
  relocate the page
  set_page_dirty(page);
  }
  unlock_page(page);
  
  if (the_page_was_in_core) {
  sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
  SYNC_FILE_RANGE_WRITE|
  SYNC_FILE_RANGE_WAIT_AFTER);
  fadvise(fd, offset, FADV_DONTNEED);
  }
  
  completely coherent with pagecache, quite safe in the presence of mmap,
  mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
   Yes, this is the simple way. But I see two disadvantages:
 1) You'd like to relocate metadata (indirect blocks) too.

Well.  Do we really?  Are we looking for a 100% solution here, or a 90% one?

Relocating data is the main thing.  After that, yeah, relocating metadata,
inodes and directories is probably a second-order thing.

 For that you need
a different mechanism.

I suspect a similar approach will work there: load and lock the
buffer_heads (or maybe just the top-level buffer_head) and then alter their
contents.  It could be that verify_chain() will just magically do the right
thing there, but some changes might be needed.

 In my approach, you can mostly assume you've got
sanely laid out metadata and so the existence of such mechanism is not
so important.
 2) You'd like to allocate new blocks in big chunks. So your kernel function
should rather take a range. Also when you fail in the middle of
relocating a file (for example the block you'd like to use is already
taken by someone else), I find it nice if you can return at least to the
original state. But that's probably not important.

Well yes, that was a minimal sketch.
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

2007-02-08 Thread Jan Kara

On Thu 08-02-07 02:32:13, Andrew Morton wrote:
 On Thu, 8 Feb 2007 11:21:02 +0100 Jan Kara [EMAIL PROTECTED] wrote:
 
  On Thu 08-02-07 01:45:29, Andrew Morton wrote:
   snip
  I thought Andreas meant any write changes - i.e. you check that no one
has open file descriptor for writing and block any new open for writing.
That can be done quite easily.
  Anyway, I agree with you that userspace solution to a possible page
cache pollution is preferable after thinking about it for a while.
As I've been thinking about it, we could actually do the copying
from user space. We could do something like:
  block any writes to file (as I described above)
  craft new inode with blocks allocated as we want (using preallocation,
we should mostly have the kernel infrastructure we need)
  copy data using splice syscall
  call the kernel to switch data

   
   I don't think we need to block any writes to any file or anything.
   
   To move a page within a file:
   
 fd = open(file);
 p = mmap(fd);
 the_page_was_in_core = mincore(p, offset);
 munmap(p);
 ioctl(fd, ..., new_block);
   
 kernel
 read_cache_page(inode, offset);
 lock_page(page);
 if (try_to_free_buffers(page)) {
 relocate the page
 set_page_dirty(page);
 }
 unlock_page(page);
   
 if (the_page_was_in_core) {
 sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
 SYNC_FILE_RANGE_WRITE|
 SYNC_FILE_RANGE_WAIT_AFTER);
 fadvise(fd, offset, FADV_DONTNEED);
 }
   
   completely coherent with pagecache, quite safe in the presence of mmap,
   mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
Yes, this is the simple way. But I see two disadvantages:
  1) You'd like to relocate metadata (indirect blocks) too.
 
 Well.  Do we really?  Are we looking for a 100% solution here, or a 90% one?
  Umm, I think that for ext3 having data on one end of the disk and
indirect blocks on the other end of the disk does not quite help (not
to mention that it can create bad free-space fragmentation over time).
I have not measured it but I'd guess that it would erase the effect of
moving data closer together. At least for sequential reads.

 Relocating data is the main thing.  After that, yeah, relocating metadata,
 inodes and directories is probably a second-order thing.
 
  For that you need
 a different mechanism.
 
 I suspect a similar approach will work there: load and lock the
 buffer_heads (or maybe just the top-level buffer_head) and then alter their
 contents.  It could be that verify_chain() will just magically do the right
 thing there, but some changes might be needed.
  Yes, it could be done. I just wanted to point to the fact that things may
not be as simple in your solution either...

Honza
-- 
Jan Kara [EMAIL PROTECTED]
SuSE CR Labs
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC][PATCH 3/3] Online defrag command

2007-02-08 Thread Jens Axboe

On Thu, Feb 08 2007, Takashi Sato wrote:
 The defrag command.  Usage is as follows:
 o Put the multiple files closer together.
   # e4defrag -r directory-name
 o Defrag for a single file.
   # e4defrag file-name
 o Defrag for all files on ext4.
   # e4defrag device-name

Would it be possible to provide support for putting multiple files close
together? Ala

# e4defrag file1 file2 file3 ... fileN

I'm thinking boot speedup, gather the list of read files and put them
close on disk.

-- 
Jens Axboe

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] e2fsck journal recovery can corrupt all superblock backups

2007-02-08 Thread Andreas Dilger

Ted,
this was sent with the first patch, and it looks like it is a very
serious problem.

Looking through the e2fsck code it would also seem possible to move the
setting of EXT2_FLAG_MASTER_SB_ONLY before the journal replay.  That
is the second patch.  I'm not sure which one is better.

Jim Garlick wrote:
 When fsck replays the journal, it clears the EXT3_FEATURE_INCOMPAT_RECOVER
 feature, dirties the superblock, and closes the file system.
 Unfortunately, the file system EXT2_FLAG_MASTER_SB_ONLY flag is not set
 at this time, so it copies the primary superblock and group descriptors
 over all the backups.  Then fsck restarts and checks the superblock for
 consistency.  If the superblock or group descriptors are then found to
 be bad, all your backups are now also bad.

Index: e2fsprogs+chaos/lib/ext2fs/openfs.c
===
--- e2fsprogs+chaos.orig/lib/ext2fs/openfs.c
+++ e2fsprogs+chaos/lib/ext2fs/openfs.c
@@ -101,6 +101,8 @@ errcode_t ext2fs_open2(const char *name,
memset(fs, 0, sizeof(struct struct_ext2_filsys));
fs-magic = EXT2_ET_MAGIC_EXT2FS_FILSYS;
fs-flags = flags;
+   /* don't overwrite sb backups unless flag is explicitly cleared */
+   fs-flags |= EXT2_FLAG_MASTER_SB_ONLY; 
fs-umask = 022;
retval = ext2fs_get_mem(strlen(name)+1, fs-device_name);
if (retval)

---

Index: e2fsprogs/e2fsck/unix.c
===
--- e2fsprogs.orig/e2fsck/unix.c2006-12-27 17:12:23.0 -0700
+++ e2fsprogs/e2fsck/unix.c 2007-02-08 22:05:13.0 -0700
@@ -1153,6 +1153,15 @@ restart:
}
 
/*
+* We only update the master superblock because (a) paranoia;
+* we don't want to corrupt the backup superblocks, and (b) we
+* don't need to update the mount count and last checked
+* fields in the backup superblock (the kernel doesn't
+* update the backup superblocks anyway).
+*/
+   fs-flags |= EXT2_FLAG_MASTER_SB_ONLY;
+
+   /*
 * Check to see if we need to do ext3-style recovery.  If so,
 * do it, and then restart the fsck.
 */
@@ -1227,15 +1236,6 @@ restart:
!(ctx-options  E2F_OPT_READONLY))
ext2fs_mark_super_dirty(fs);
 
-   /*
-* We only update the master superblock because (a) paranoia;
-* we don't want to corrupt the backup superblocks, and (b) we
-* don't need to update the mount count and last checked
-* fields in the backup superblock (the kernel doesn't
-* update the backup superblocks anyway).
-*/
-   fs-flags |= EXT2_FLAG_MASTER_SB_ONLY;
-
ehandler_init(fs-io);
 
if (ctx-superblock)

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC][PATCH 1/3] Allocate new contiguous blocks

[RFC][PATCH 2/3] Move the file data to the new blocks

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

Re: [RFC] [PATCH 1/1] Nanosecond timestamps

Re: [RFC] [PATCH 1/1] Nanosecond timestamps

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

Re: [RFC][PATCH 2/3] Move the file data to the new blocks

Re: [RFC][PATCH 3/3] Online defrag command

[PATCH] e2fsck journal recovery can corrupt all superblock backups

10 matches

Site Navigation

Mail list logo

Footer information