[PATCH 2/2] FIEMAP ioctl for ext4

2007-11-12 Thread Kalpak Shah
Recently there was discussion about an FIle Extent MAP(FIEMAP) ioctl for 
efficiently mapping the extents and holes of a file. This will be many times 
more efficient than FIBMAP by cutting down the number of ioctls.

This patch adds the FIEMAP ioctl for ext4. The spec for the FIEMAP ioctl was 
posted earlier by Andreas Dilger and can be found at:
http://www.mail-archive.com/[EMAIL PROTECTED]/msg03944.html

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED] 

Index: linux-2.6.23.1/fs/ext4/ioctl.c
===
--- linux-2.6.23.1.orig/fs/ext4/ioctl.c
+++ linux-2.6.23.1/fs/ext4/ioctl.c
@@ -16,6 +16,7 @@
 #include linux/compat.h
 #include linux/smp_lock.h
 #include asm/uaccess.h
+#include linux/fiemap.h
 
 int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned long arg)
@@ -248,6 +249,9 @@ flags_err:
 
return err;
}
+   case EXT4_IOC_FIEMAP: {
+   return ext4_fiemap(inode, filp, cmd, arg);
+   }
 
default:
return -ENOTTY;
Index: linux-2.6.23.1/include/linux/ext4_fs.h
===
--- linux-2.6.23.1.orig/include/linux/ext4_fs.h
+++ linux-2.6.23.1/include/linux/ext4_fs.h
@@ -228,15 +228,20 @@ struct ext4_new_group_data {
 #defineEXT4_IOC_SETFLAGS   FS_IOC_SETFLAGS
 #defineEXT4_IOC_GETVERSION _IOR('f', 3, long)
 #defineEXT4_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT4_IOC_GETRSVSZ  _IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ  _IOW('f', 6, long)
 #define EXT4_IOC_GROUP_EXTEND  _IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
+#define EXT4_IOC_FIEMAP_IOWR('f', 10, struct fiemap)
 #defineEXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
 #defineEXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
 #endif
-#define EXT4_IOC_GETRSVSZ  _IOR('f', 5, long)
-#define EXT4_IOC_SETRSVSZ  _IOW('f', 6, long)
+
+/* ext4 only handles a single LUN, ignore LUN_OFFSET flag */
+#define EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP (FIEMAP_FLAG_INCOMPAT\
+ ~(FIEMAP_FLAG_LUN_OFFSET))
 
 /*
  * ioctl commands in 32 bit emulation
@@ -1067,6 +1072,8 @@ ext4_get_blocks_wrap(handle_t *handle, s
return ext4_get_blocks_handle(handle, inode, block, max_blocks, bh,
create, extend_disksize);
 }
+extern int ext4_fiemap(struct inode *inode, struct file *filp, unsigned int 
cmd,
+  unsigned long arg);
 

 #endif /* __KERNEL__ */
Index: linux-2.6.23.1/include/linux/ext4_fs_extents.h
===
--- linux-2.6.23.1.orig/include/linux/ext4_fs_extents.h
+++ linux-2.6.23.1/include/linux/ext4_fs_extents.h
@@ -131,8 +131,8 @@ struct ext4_ext_path {
  * callback must return valid extent (passed or newly created)
  */
 typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
-   struct ext4_ext_cache *,
-   void *);
+   struct ext4_ext_cache *,
+   struct ext4_extent *, void *);
 
 #define EXT_CONTINUE   0
 #define EXT_BREAK  1
Index: linux-2.6.23.1/fs/ext4/extents.c
===
--- linux-2.6.23.1.orig/fs/ext4/extents.c
+++ linux-2.6.23.1/fs/ext4/extents.c
@@ -41,6 +41,7 @@
 #include linux/slab.h
 #include linux/falloc.h
 #include linux/ext4_fs_extents.h
+#include linux/fiemap.h
 #include asm/uaccess.h
 

@@ -1512,7 +1513,7 @@ int ext4_ext_walk_space(struct inode *in
}
 
BUG_ON(cbex.ec_len == 0);
-   err = func(inode, path, cbex, cbdata);
+   err = func(inode, path, cbex, ex, cbdata);
ext4_ext_drop_refs(path);
 
if (err  0)
@@ -2629,3 +2630,163 @@ retry:
 
return ret  0 ? ret2 : ret;
 }
+
+struct fiemap_internal {
+   struct fiemap   *fiemap_s;
+   struct fiemap_extentfm_extent;
+   size_t  tot_mapping_len;
+   char*cur_ext_ptr;
+   int current_extent;
+   int err;
+};
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+  struct ext4_ext_cache *newex, struct ext4_extent *ex,
+  void *data)
+{
+   struct fiemap_internal *fiemap_i = data;
+   struct fiemap *fiemap_s = fiemap_i

[PATCH 1/2] Add FIEMAP header file

2007-11-12 Thread Kalpak Shah
Recently there was discussion about an FIle Extent MAP(FIEMAP) ioctl for 
efficiently mapping the extents and holes of a file. This will be many times 
more efficient than FIBMAP by cutting down the number of ioctls.

This patch adds the FIEMAP header file in include/linux.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED] 

Index: linux-2.6.23.1/include/linux/fiemap.h
===
--- /dev/null
+++ linux-2.6.23.1/include/linux/fiemap.h
@@ -0,0 +1,49 @@
+/*
+ * include/linux/fiemap.h
+ *
+ * Copyright (C) 2007 Sun Microsystems, Inc.
+ *
+ * Author: Kalpak Shah [EMAIL PROTECTED]
+ *Andreas Dilger [EMAIL PROTECTED]
+ */
+
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+struct fiemap_extent {
+   __u64   fe_offset; /* offset in bytes for the start of the extent */
+   __u64   fe_length; /* length in bytes for the extent */
+   __u32   fe_flags;  /* returned FIEMAP_EXTENT_* flags for the extent */
+   __u32   fe_lun;/* logical device number for extent (starting at 0)*/
+};
+
+struct fiemap {
+   __u64   fm_start;/* logical starting byte offset (in/out) */
+   __u64   fm_length;   /* logical length of map (in/out) */
+   __u32   fm_flags;/* FIEMAP_FLAG_* flags for request (in/out) */
+   __u32   fm_extent_count; /* number of extents in fm_extents (in/out) */
+   __u64   fm_end_offset;   /* logical offset of end of mapping in last 
ioctl */
+   struct fiemap_extentfm_extents[0];
+};
+
+#define FIEMAP_FLAG_SYNC   0x0001 /* sync file data before map */
+#define FIEMAP_FLAG_HSM_READ   0x0002 /* get data from HSM before map */
+#define FIEMAP_FLAG_NUM_EXTENTS0x0004 /* return only number of 
extents */
+#define FIEMAP_FLAG_INCOMPAT   0xff00 /* error for unknown flags in here */
+
+#define FIEMAP_FLAG_LUN_OFFSET 0x0100 /* use lun offsets, instead of
+   * logical file offsets */
+
+#define FIEMAP_EXTENT_HOLE 0x0001 /* has no data or space allocation */
+#define FIEMAP_EXTENT_UNWRITTEN0x0002 /* space allocated, but no 
data */
+#define FIEMAP_EXTENT_UNMAPPED 0x0004 /* has data but no space allocation*/
+#define FIEMAP_EXTENT_ERROR0x0008 /* mapping error, errno in fe_start*/
+#define FIEMAP_EXTENT_NO_DIRECT0x0010 /* cannot access data 
directly */
+#define FIEMAP_EXTENT_LAST 0x0020 /* last extent in the file */
+#define FIEMAP_EXTENT_DELALLOC 0x0040 /* has data but not yet written,
+   * must have EXTENT_UNKNOWN set */
+#define FIEMAP_EXTENT_SECONDARY0x0080 /* data (also) in secondary 
storage,
+   * not in primary if EXTENT_UNKNOWN*/
+#define FIEMAP_EXTENT_EOF  0x0100 /* if fm_start+fm_len is beyond EOF*/
+
+#endif /* _LINUX_FIEMAP_H */


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-17 Thread Kalpak Shah
On Mon, 2007-07-16 at 17:49 -0700, Mingming Cao wrote:
 On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
  On Sun, 01 Jul 2007 03:36:56 -0400
  Mingming Cao [EMAIL PROTECTED] wrote:
   +static inline __le32 ext4_encode_extra_time(struct timespec *time)
   +{
   +   return cpu_to_le32((sizeof(time-tv_sec)  4 ?
   +time-tv_sec  32 : 0) |
   +((time-tv_nsec  2)  EXT4_NSEC_MASK));
   +}
   +
   +static inline void ext4_decode_extra_time(struct timespec *time, __le32 
   extra)
   +{
   +   if (sizeof(time-tv_sec)  4)
   +time-tv_sec |= (__u64)(le32_to_cpu(extra)  EXT4_EPOCH_MASK)
   + 32;
   +   time-tv_nsec = (le32_to_cpu(extra)  EXT4_NSEC_MASK)  2;
   +}
  
  Consider uninlining these functions.
  
 I got compile warining after apply Kalpal's update nanosecond patch,
 which makes these two function inline. It complains these functions are
 defined but not used. It's being used only in the following
 micros(EXT4_INODE_SET_XTIME etc).  So if the .c file included the
 ext4_fs.h but not using the micros, the compile will think these two
 functions are not used.

The compile warnings were introduced because the functions were
uninlined. So we can either keep these functions inlined or consider
adding a __used attribute to these two functions. 

Thanks,
Kalpak.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 7][PATCH 1/1]Remove 32000 subdirs limit.

2007-07-17 Thread Kalpak Shah
On Fri, 2007-07-13 at 09:53 -0700, Andrew Morton wrote:
 On Fri, 13 Jul 2007 16:00:48 +0530 Kalpak Shah [EMAIL PROTECTED] wrote:
 
 
-   if (inode-i_nlink = EXT4_LINK_MAX)
+   if (EXT4_DIR_LINK_MAX(inode))
return -EMLINK;
   
   argh.  WHY_IS_EXT4_FULL_OF_UPPER_CASE_MACROS_WHICH_COULD_BE_IMPLEMENTED
   as_lower_case_inlines?  Sigh.  It's all the old-timers, I guess.
   
   EXT4_DIR_LINK_MAX() is buggy: it evaluates its arg twice.
  
  #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir)  (dir)-i_nlink = 
  EXT4_LINK_MAX)
  
  This just checks if directory has hash indexing in which case we need not 
  worry about EXT4_LINK_MAX subdir limit. If directory is not hash indexed 
  then we will need to enforce a max subdir limit. 
  
  Sorry, I didn't understand what is the problem with this macro?
 
 Macros should never evaluate their argument more than once, because if they
 do they will misbehave when someone passes them an
 expression-with-side-effects:
 
   struct inode *p = q;
 
   EXT4_DIR_LINK_MAX(p++);
 
 one expects `p' to have the value q+1 here.  But it might be q+2.
 
 and
 
   EXT4_DIR_LINK_MAX(some_function());
 
 might cause some_function() to be called twice.
 
 
 This is one of the many problems which gets fixed when we write code in C
 rather than in cpp.

Thanks. Here is the fix converting these macros into inlined functions.



The EXT4_DIR_LINK_MAX(dir) and EXT4_DIR_LINK_EMPTY(dir) macros were
evaluating their argument twice so convert them into inlined functions.

Signed-off-by: Kalpak Shah [EMAIL PROTECTED]

Index: linux-2.6.22/fs/ext4/namei.c
===
--- linux-2.6.22.orig/fs/ext4/namei.c
+++ linux-2.6.22/fs/ext4/namei.c
@@ -1742,7 +1742,7 @@ static int ext4_mkdir(struct inode * dir
struct ext4_dir_entry_2 * de;
int err, retries = 0;

-   if (EXT4_DIR_LINK_MAX(dir))
+   if (ext4_dir_link_max(dir))
return -EMLINK;

 retry:
@@ -2062,7 +2062,7 @@ static int ext4_rmdir (struct inode * di
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
-   if (!EXT4_DIR_LINK_EMPTY(inode))
+   if (!ext4_dir_link_empty(inode))
ext4_warning (inode-i_sb, ext4_rmdir,
  empty directory has too many links (%d),
  inode-i_nlink);
@@ -2201,7 +2201,7 @@ static int ext4_link (struct dentry * ol
struct inode *inode = old_dentry-d_inode;
int err, retries = 0;

-   if (EXT4_DIR_LINK_MAX(inode))
+   if (ext4_dir_link_max(inode))
return -EMLINK;

/*
Index: linux-2.6.22/include/linux/ext4_fs.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs.h
+++ linux-2.6.22/include/linux/ext4_fs.h
@@ -797,12 +797,18 @@ struct ext4_dir_entry_2 {
   #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir-i_sb, \
  EXT4_FEATURE_COMPAT_DIR_INDEX)  
\
  (EXT4_I(dir)-i_flags  EXT4_INDEX_FL))
-#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir)  (dir)-i_nlink = EXT4_LINK_MAX)
-#define EXT4_DIR_LINK_EMPTY(dir) ((dir)-i_nlink == 2 || (dir)-i_nlink == 1)
+static inline int ext4_dir_link_max(struct inode *dir)
+{
+   return (!is_dx(dir)  (dir)-i_nlink = EXT4_LINK_MAX);
+}
+static inline int ext4_dir_link_empty(struct inode *dir)
+{
+   return ((dir)-i_nlink == 2 || (dir)-i_nlink == 1);
+}
 #else
   #define is_dx(dir) 0
-#define EXT4_DIR_LINK_MAX(dir) ((dir)-i_nlink = EXT4_LINK_MAX)
-#define EXT4_DIR_LINK_EMPTY(dir) ((dir)-i_nlink == 2)
+#define ext4_dir_link_max(dir) ((dir)-i_nlink = EXT4_LINK_MAX)
+#define ext4_dir_link_empty(dir) ((dir)-i_nlink == 2)
 #endif

 /* Legal values for the dx_root hash_version field: */



-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-13 Thread Kalpak Shah
On Fri, 2007-07-13 at 09:59 +0530, Aneesh Kumar K.V wrote:
 
 Kalpak Shah wrote:
  On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
  On Sun, 01 Jul 2007 03:36:56 -0400
  Mingming Cao [EMAIL PROTECTED] wrote:
 
  This patch is a spinoff of the old nanosecond patches.
  I don't know what the old nanosecond patches are.  A link to a suitable
  changlog for those patches would do in a pinch.  Preferable would be to
  write a proper changelog for this patch.
  
  The incremental patch contains a proper changelog describing the patch.
  
 
 
 Instead of  putting incremental patches it would be nice if we can have 
 replacement patches.
 for the already existing patches with the comments addressed. For example if 
 we have a 
 review comment on the patch message ( commit log ) then adding an incremental 
 patch doesn't help.

I think that it would be easier to review just the changes that have
been made to the patches instead of having people go through the entire
patch again. I was hoping that someone with write access to ext4-git
would update the commit logs.

If replacement patches are preferred, then I will send them again.

Thanks,
Kalpak.

 
 
 -aneesh
 -
 To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 7][PATCH 1/1]Remove 32000 subdirs limit.

2007-07-13 Thread Kalpak Shah
The updated patch is attached. comments inline...

On Tue, 2007-07-10 at 22:40 -0700, Andrew Morton wrote:
  If we exceed 65000 subdirectories in an htree directory it sets the
  inode link count to 1 and no longer counts subdirectories.  The
  directory link count is not actually used when determining if a
  directory is empty, as that only counts subdirectories and not regular
  files that might be in there. 
  
  A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if
  the subdir count for any directory crosses 65000.
  
 
 Would I be correct in assuming that a later fsck will clear
 EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any 65000 subdir
 directories?
 
 If so, that is worth a mention in the changelog, perhaps?

The changelog has been updated to include this.

   
  +static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
  +{
  +   inc_nlink(inode);
  +   if (is_dx(inode)  inode-i_nlink  1) {
  +   /* limit is 16-bit i_links_count */
  +   if (inode-i_nlink = EXT4_LINK_MAX || inode-i_nlink == 2) {
  +   inode-i_nlink = 1;
  +   EXT4_SET_RO_COMPAT_FEATURE(inode-i_sb,
  + EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
  +   }
  +   }
  +}
 
 Looks too big to be inlined.
 
 Why do we set EXT4_FEATURE_RO_COMPAT_DIR_NLINK if i_nlink==2?

I have added a comment for this. (since it indicates that nlinks==1
previously).

 
  +static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
  +{
  +   drop_nlink(inode);
  +   if (S_ISDIR(inode-i_mode)  inode-i_nlink == 0)
  +   inc_nlink(inode);
  +}
 
 Probably too big to inline.

Removed the inline.

   
  -   if (inode-i_nlink = EXT4_LINK_MAX)
  +   if (EXT4_DIR_LINK_MAX(inode))
  return -EMLINK;
 
 argh.  WHY_IS_EXT4_FULL_OF_UPPER_CASE_MACROS_WHICH_COULD_BE_IMPLEMENTED
 as_lower_case_inlines?  Sigh.  It's all the old-timers, I guess.
 
 EXT4_DIR_LINK_MAX() is buggy: it evaluates its arg twice.

#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir)  (dir)-i_nlink = EXT4_LINK_MAX)

This just checks if directory has hash indexing in which case we need not worry 
about EXT4_LINK_MAX subdir limit. If directory is not hash indexed then we will 
need to enforce a max subdir limit. 

Sorry, I didn't understand what is the problem with this macro?

Thanks,
Kalpak.
This patch adds support to ext4 for allowing more than 65000
subdirectories. Currently the maximum number of subdirectories is capped
at 32000.

If we exceed 65000 subdirectories in an htree directory it sets the
inode link count to 1 and no longer counts subdirectories.  The
directory link count is not actually used when determining if a
directory is empty, as that only counts subdirectories and not regular
files that might be in there. 

A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if
the subdir count for any directory crosses 65000. A later fsck will clear
EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any directory
with 65000 subdirs.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]


---
 fs/ext4/namei.c |   52 +++-
 include/linux/ext4_fs.h |4 ++-
 2 files changed, 41 insertions(+), 15 deletions(-)

Index: linux-2.6.22/fs/ext4/namei.c
===
--- linux-2.6.22.orig/fs/ext4/namei.c
+++ linux-2.6.22/fs/ext4/namei.c
@@ -1617,6 +1617,35 @@ static int ext4_delete_entry (handle_t *
 	return -ENOENT;
 }
 
+/*
+ * DIR_NLINK feature is set if 1) nlinks  EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode)  inode-i_nlink  1) {
+		/* limit is 16-bit i_links_count */
+		if (inode-i_nlink = EXT4_LINK_MAX || inode-i_nlink == 2) {
+			inode-i_nlink = 1;
+			EXT4_SET_RO_COMPAT_FEATURE(inode-i_sb,
+	  EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+		}
+	}
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * directory has EXT4_LINK_MAX subdirs.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	drop_nlink(inode);
+	if (S_ISDIR(inode-i_mode)  inode-i_nlink == 0)
+		inc_nlink(inode);
+}
+
+
 static int ext4_add_nondir(handle_t *handle,
 		struct dentry *dentry, struct inode *inode)
 {
@@ -1713,7 +1742,7 @@ static int ext4_mkdir(struct inode * dir
 	struct ext4_dir_entry_2 * de;
 	int err, retries = 0;
 
-	if (dir-i_nlink = EXT4_LINK_MAX)
+	if (EXT4_DIR_LINK_MAX(dir))
 		return -EMLINK;
 
 retry:
@@ -1736,7 +1765,7 @@ retry:
 	inode-i_size = EXT4_I(inode)-i_disksize = inode-i_sb-s_blocksize;
 	dir_block = ext4_bread (handle, inode, 0, 1, err);
 	if (!dir_block) {
-		drop_nlink(inode); /* is this nlink == 0? */
+		ext4_dec_count(handle, inode); /* is this nlink == 0

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-12 Thread Kalpak Shah
On Tue, 2007-07-10 at 16:32 -0700, Andrew Morton wrote:
 On Sun, 01 Jul 2007 03:38:01 -0400
 Mingming Cao [EMAIL PROTECTED] wrote:
 
  This patch is on top of the nanosecond timestamp and i_version_hi
  patches. 
 
 This sort of information isn't needed (or desired) when this patch hits the
 git tree.  Please ensure that things like this are cleaned up before the
 patches go to Linus.

The incremental patch I have attached contains an updated Changelog
entry which cleans this up.

  +   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
  +   /* We need extra buffer credits since we may write into EA block
  +* with this same handle */
  +   if ((jbd2_journal_extend(handle,
  +EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
  +   ret = ext4_expand_extra_isize(inode,
  +   EXT4_SB(inode-i_sb)-s_want_extra_isize,
  +   iloc, handle);
  +   if (ret) {
  +   EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
  +   if (!expand_message) {
  +   ext4_warning(inode-i_sb, __FUNCTION__,
  +   Unable to expand inode %lu. Delete
  +some EAs or run e2fsck.,
  +   inode-i_ino);
  +   expand_message = 1;
  +   }
  +   }
  +   }
  +   }
 
 Maybe that message should come out once per mount rather than once per boot
 (or once per modprobe)?

Done. Now the message gets printed the first time s_mnt_count changes,
which means once per mount.

  +   if (free  new_extra_isize) {
  +   if (!tried_min_extra_isize  s_min_extra_isize) {
  +   tried_min_extra_isize++;
  +   new_extra_isize = s_min_extra_isize;
 
 Aren't we missing a brelse(bh) here?

I have corrected this.

   
  +#define IHDR(inode, raw_inode) \
  +   ((struct ext4_xattr_ibody_header *) \
  +   ((void *)raw_inode + \
  +   EXT4_GOOD_OLD_INODE_SIZE + \
  +   EXT4_I(inode)-i_extra_isize))
  +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
 Neither of these _have_ to be implemented as macros and hence they should
 not be.

These macros existed before and have just been moved. There are similar
such macros in the ext3/4 xattr code and they should probably be changed
to inlines. But that should be done in a different patch.

Thanks,
Kalpak.
We need to make sure that existing ext3 filesystems can also avail the new
fields that have been added to the ext4 inode. We use s_want_extra_isize and
s_min_extra_isize to decide by how much we should expand the inode. If
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand the inode by
max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
about whether users should be able to set s_*_extra_isize smaller than
the known fields or not.

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

This would be online expansion of inodes. expand_extra_isize option will also
be added to e2fsck which will expand all the inodes and if for any reason 
expansion fails for any inode then the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE
feature will be reset.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]
Signed-off-by: Mingming Cao [EMAIL PROTECTED]

Index: linux-2.6.22/fs/ext4/inode.c
===
--- linux-2.6.22.orig/fs/ext4/inode.c
+++ linux-2.6.22/fs/ext4/inode.c
@@ -3130,9 +3130,8 @@ int ext4_expand_extra_isize(struct inode
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_xattr_entry *entry;
 
-	if (EXT4_I(inode)-i_extra_isize = new_extra_isize) {
+	if (EXT4_I(inode)-i_extra_isize = new_extra_isize)
 		return 0;
-	}
 
 	raw_inode = ext4_raw_inode(iloc);
 
@@ -3148,9 +3147,9 @@ int ext4_expand_extra_isize(struct inode
 		return 0;
 	}
 
-	/* try to expand with EA present */
+	/* try to expand with EAs present */
 	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
-		raw_inode, handle

Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-12 Thread Kalpak Shah
On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
 On Sun, 01 Jul 2007 03:36:56 -0400
 Mingming Cao [EMAIL PROTECTED] wrote:
 
  This patch is a spinoff of the old nanosecond patches.
 
 I don't know what the old nanosecond patches are.  A link to a suitable
 changlog for those patches would do in a pinch.  Preferable would be to
 write a proper changelog for this patch.

The incremental patch contains a proper changelog describing the patch.

  +static inline __le32 ext4_encode_extra_time(struct timespec *time)
  +{
  +   return cpu_to_le32((sizeof(time-tv_sec)  4 ?
  +  time-tv_sec  32 : 0) |
  +  ((time-tv_nsec  2)  EXT4_NSEC_MASK));
  +}
  +
  +static inline void ext4_decode_extra_time(struct timespec *time, __le32 
  extra)
  +{
  +   if (sizeof(time-tv_sec)  4)
  +  time-tv_sec |= (__u64)(le32_to_cpu(extra)  EXT4_EPOCH_MASK)
  +   32;
  +   time-tv_nsec = (le32_to_cpu(extra)  EXT4_NSEC_MASK)  2;
  +}
 
 Consider uninlining these functions.

Done.

This patch adds comments, few small corrections and an appropriate
changelog entry.

Thanks,
Kalpak.
This patch adds nanosecond timestamps for ext4. This involves adding
*time_extra fields to the ext4_inode to extend the timestamps to 64-bits.
Creation time is also added by this patch.

These extended fields will fit into an inode if the filesystem was formatted
with large inodes (-I 256 or larger) and there are currently no EAs
consuming all of the available space. For new inodes we always reserve
enough space for the kernel's known extended fields, but for inodes
created with an old kernel this might not have been the case. So this patch
also adds the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature flag(ro-compat so that
older kernels can't create inodes with a smaller extra_isize). which indicates
if the fields fitting inside s_min_extra_isize are available or not.
If the expansion of inodes if unsuccessful then this feature will be disabled.
This feature is only enabled if requested by the sysadmin.

None of the extended inode fields is critical for correct filesystem operation.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]
Signed-off-by: Eric Sandeen [EMAIL PROTECTED]
Signed-off-by: Dave Kleikamp [EMAIL PROTECTED]
Signed-off-by: Mingming Cao [EMAIL PROTECTED]

Index: linux-2.6.22/include/linux/ext4_fs.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs.h
+++ linux-2.6.22/include/linux/ext4_fs.h
@@ -350,20 +350,30 @@ struct ext4_inode {
 #define EXT4_EPOCH_MASK ((1  EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL  EXT4_EPOCH_BITS)
 
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ */
 #define EXT4_FITS_IN_INODE(ext4_inode, einode, field)	\
 	((offsetof(typeof(*ext4_inode), field) +	\
 	  sizeof((ext4_inode)-field))			\
 	= (EXT4_GOOD_OLD_INODE_SIZE +			\
 	(einode)-i_extra_isize))			\
 
-static inline __le32 ext4_encode_extra_time(struct timespec *time)
+static __le32 ext4_encode_extra_time(struct timespec *time)
 {
return cpu_to_le32((sizeof(time-tv_sec)  4 ?
 			   time-tv_sec  32 : 0) |
 			   ((time-tv_nsec  2)  EXT4_NSEC_MASK));
 }
 
-static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+static void ext4_decode_extra_time(struct timespec *time, __le32 extra)
 {
if (sizeof(time-tv_sec)  4)
 	   time-tv_sec |= (__u64)(le32_to_cpu(extra)  EXT4_EPOCH_MASK)
Index: linux-2.6.22/include/linux/ext4_fs_i.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs_i.h
+++ linux-2.6.22/include/linux/ext4_fs_i.h
@@ -153,6 +153,10 @@ struct ext4_inode_info {
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
 	struct timespec i_crtime;
 };
 


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-11 Thread Kalpak Shah
On Wed, 2007-07-11 at 10:34 -0700, Andrew Morton wrote:
 On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger [EMAIL PROTECTED] wrote:
 
  On Jul 10, 2007  16:32 -0700, Andrew Morton wrote:
err = ext4_reserve_inode_write(handle, inode, iloc);
+   if (EXT4_I(inode)-i_extra_isize 
+   EXT4_SB(inode-i_sb)-s_want_extra_isize 
+   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
+   /* We need extra buffer credits since we may write into 
EA block
+* with this same handle */
+   if ((jbd2_journal_extend(handle,
+EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 
0) {
+   ret = ext4_expand_extra_isize(inode,
+   
EXT4_SB(inode-i_sb)-s_want_extra_isize,
+   iloc, handle);
+   if (ret) {
+   EXT4_I(inode)-i_state |= 
EXT4_STATE_NO_EXPAND;
+   if (!expand_message) {
+   ext4_warning(inode-i_sb, 
__FUNCTION__,
+   Unable to expand inode %lu. 
Delete
+some EAs or run e2fsck.,
+   inode-i_ino);
+   expand_message = 1;
+   }
+   }
+   }
+   }
   
   Maybe that message should come out once per mount rather than once per 
   boot
   (or once per modprobe)?
  
  Probably true.
  
   What are the consequences of a jbd2_journal_extend() failure in here?
  
  Not fatal, just like every user of i_extra_isize.  If the inode isn't a
  large inode, or it can't be expanded then there will be a minor loss of
  functionality on that inode.  If the i_extra_isize is critical, then
  the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
  
  Note that this is only applicable for filesystems which are upgraded.  For
  new inodes (i.e. all inodes that exist in the filesystem if it was always
  run on a kernel with the currently understood extra fields) then this will
  never be invoked (until such a time new extra fields are added).
 
 I'd suggest that we get a comment in the code explaining this: this
 unchecked error does rather stand out.
 
+   if (EXT4_I(inode)-i_file_acl) {
+   bh = sb_bread(inode-i_sb, EXT4_I(inode)-i_file_acl);
+   error = -EIO;
+   if (!bh)
+   goto cleanup;
+   if (ext4_xattr_check_block(bh)) {
+   ext4_error(inode-i_sb, __FUNCTION__,
+   inode %lu: bad block %llu, 
inode-i_ino,
+   EXT4_I(inode)-i_file_acl);
+   error = -EIO;
+   goto cleanup;
+   }
+   base = BHDR(bh);
+   first = BFIRST(bh);
+   end = bh-b_data + bh-b_size;
+   min_offs = end - base;
+   free = ext4_xattr_free_space(first, min_offs, base,
+total_blk);
+   if (free  new_extra_isize) {
+   if (!tried_min_extra_isize  
s_min_extra_isize) {
+   tried_min_extra_isize++;
+   new_extra_isize = s_min_extra_isize;
   
   Aren't we missing a brelse(bh) here?
  
  Seems likely, yes.
 
 OK - could we get a positive ack from someone indicating that this will get
 looked at?  Because I am about to forget about it.

I will send a patch to add the comments and make the suggested
corrections.

Thanks,
Kalpak.

 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-03 Thread Kalpak Shah
On Sun, 2007-07-01 at 03:36 -0400, Mingming Cao wrote:
 +
 +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)
\
 +do {\
 + (inode)-xtime.tv_sec = le32_to_cpu((raw_inode)-xtime);   \
 + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
 + ext4_decode_extra_time((inode)-xtime,\
 +raw_inode-xtime ## _extra);\
 +} while (0)
 +
 +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)  
\
 +do {\
 + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
 + (einode)-xtime.tv_sec = le32_to_cpu((raw_inode)-xtime);  \
 + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
 + ext4_decode_extra_time((einode)-xtime,   \
 +raw_inode-xtime ## _extra);\
 +} while (0)
 +

This nanosecond patch seems to be missing the fix below which is required for 
http://bugzilla.kernel.org/show_bug.cgi?id=5079

If the timestamp is set to before epoch i.e. a negative timestamp then the file 
may have its date set into the future on 64-bit systems. So when the timestamp 
is read it must be cast as signed.

Index: linux-2.6.21/include/linux/ext4_fs.h
===
--- linux-2.6.21.orig/include/linux/ext4_fs.h
+++ linux-2.6.21/include/linux/ext4_fs.h
@@ -390,7 +390,7 @@ do {
   \

 #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
 do {  \
-   (inode)-xtime.tv_sec = le32_to_cpu((raw_inode)-xtime);   \
+   (inode)-xtime.tv_sec = (signed)le32_to_cpu((raw_inode)-xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
ext4_decode_extra_time((inode)-xtime,\
   raw_inode-xtime ## _extra);\
@@ -399,7 +399,8 @@ do {
   \
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)
   \
 do {  \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
-   (einode)-xtime.tv_sec = le32_to_cpu((raw_inode)-xtime);  \
+   (einode)-xtime.tv_sec =   \
+   (signed)le32_to_cpu((raw_inode)-xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
ext4_decode_extra_time((einode)-xtime,   \
   raw_inode-xtime ## _extra);\


Thanks,
Kalpak.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Testing framework

2007-04-23 Thread Kalpak Shah
On Mon, 2007-04-23 at 02:16 +0530, Karuna sagar K wrote:
 Hi,
 
 For some time I had been working on this file system test framework.
 Now I have a implementation for the same and below is the explanation.
 Any comments are welcome.
 
 Introduction:
 The testing tools and benchmarks available around do not take into
 account the repair and recovery aspects of file systems. The test
 framework described here focuses on repair and recovery capabilities
 of file systems. Since most file systems use 'fsck' to recover from
 file system inconsistencies, the test framework characterizes file
 systems based on outcomes of running 'fsck'.

snip

 Higher level perspective/approach:
 In this approach the file system is viewed as a tree of nodes, where
 nodes are either files or directories. The metadata information
 corresponding to some randomly chosen nodes of the tree are corrupted.
 Nodes which are corrupted are marked or recorded to be able to replay
 later. This file system is called source file system while the file
 system on which we need to replay the corruption is called target file
 system. The assumption is that the target file system contains a set
 of files and directories which is a superset of that in the source
 file system. Hence to replay the corruption we need point out which
 nodes in the source file system were corrupted in the source file
 system and corrupt the corresponding nodes in the target file system.
 
 A major disadvantage with this approach is that on-disk structures
 (like superblocks, block group descriptors, etc.) are not considered
 for corruption.
 
 Lower level perspective/approach:
 The file system is looked upon as a set of blocks (more precisely
 metadata blocks). We randomly choose from this set of blocks to
 corrupt. Hence we would be able to overcome the deficiency of the
 previous approach. However this approach makes it difficult to have a
 replayable corruption. Further thought about this approach has to be
 given.
 

Fill a test filesystem with data and save it. Corrupt it by copying a
chunk of data from random locations A to B. Save positions A and B so
that you can reproduce the corruption. 

Or corrupt random bits (ideally in metadata blocks) and maintain the
list of the bit numbers for reproducing the corruption.

 We could have a blend of both the approaches in the program to
 compromise between corruption and replayability.
 
 Repair Phase:
 The corrupted file system is repaired and recovered with 'fsck' or any
 other tools; this phase considers the repair and recovery action on
 the file system as a black box. The time taken to repair by the tool
 is measured.

I see that you are running fsck just once on the test filesystem. It
might be a good idea to run it twice and if second fsck does not find
the filesystem to be completely clean that means it is a bug in fsck.

snip

 Summary Phase:
 This is the final phase in the model. A report file is prepared which
 summarizes the result of this test run. The summary contains:
 
 Average time taken for recovery
 Number of files lost at the end of each iteration
 Number of files with metadata corruption at the end of each iteration
 Number of files with data corruption at the end of each iteration
 Number of files lost and found at the end of each iteration
 
 Putting it all together:
 The Corruption, Repair and Comparison phases could be repeated a
 number of times (each repetition is called an iteration) before the
 summary of that test run is prepared.
 
 TODO:
 Account for files in the lost+found directory during the comparison phase.
 Support for other file systems (only ext2 is supported currently)
 State of the either file system is stored, which may be huge, time
 consuming and not necessary. So, we could have better ways of storing
 the state.

Also, people may want to test with different mount options, so something
like mount -t $fstype -o loop,$MOUNT_OPTIONS $imgname $mountpt may be
useful. Similarly it may also be useful to have MKFS_OPTIONS while
formatting the filesystem.

Thanks,
Kalpak.

 
 Comments are welcome!!
 
 Thanks,
 Karuna

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html