[PATCH 1/2] Add FIEMAP header file

2007-11-12 Thread Kalpak Shah
Recently there was discussion about an "FIle Extent MAP"(FIEMAP) ioctl for 
efficiently mapping the extents and holes of a file. This will be many times 
more efficient than FIBMAP by cutting down the number of ioctls.

This patch adds the FIEMAP header file in include/linux.

Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]> 

Index: linux-2.6.23.1/include/linux/fiemap.h
===
--- /dev/null
+++ linux-2.6.23.1/include/linux/fiemap.h
@@ -0,0 +1,49 @@
+/*
+ * include/linux/fiemap.h
+ *
+ * Copyright (C) 2007 Sun Microsystems, Inc.
+ *
+ * Author: Kalpak Shah <[EMAIL PROTECTED]>
+ *Andreas Dilger <[EMAIL PROTECTED]>
+ */
+
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+struct fiemap_extent {
+   __u64   fe_offset; /* offset in bytes for the start of the extent */
+   __u64   fe_length; /* length in bytes for the extent */
+   __u32   fe_flags;  /* returned FIEMAP_EXTENT_* flags for the extent */
+   __u32   fe_lun;/* logical device number for extent (starting at 0)*/
+};
+
+struct fiemap {
+   __u64   fm_start;/* logical starting byte offset (in/out) */
+   __u64   fm_length;   /* logical length of map (in/out) */
+   __u32   fm_flags;/* FIEMAP_FLAG_* flags for request (in/out) */
+   __u32   fm_extent_count; /* number of extents in fm_extents (in/out) */
+   __u64   fm_end_offset;   /* logical offset of end of mapping in last 
ioctl */
+   struct fiemap_extentfm_extents[0];
+};
+
+#define FIEMAP_FLAG_SYNC   0x0001 /* sync file data before map */
+#define FIEMAP_FLAG_HSM_READ   0x0002 /* get data from HSM before map */
+#define FIEMAP_FLAG_NUM_EXTENTS0x0004 /* return only number of 
extents */
+#define FIEMAP_FLAG_INCOMPAT   0xff00 /* error for unknown flags in here */
+
+#define FIEMAP_FLAG_LUN_OFFSET 0x0100 /* use lun offsets, instead of
+   * logical file offsets */
+
+#define FIEMAP_EXTENT_HOLE 0x0001 /* has no data or space allocation */
+#define FIEMAP_EXTENT_UNWRITTEN0x0002 /* space allocated, but no 
data */
+#define FIEMAP_EXTENT_UNMAPPED 0x0004 /* has data but no space allocation*/
+#define FIEMAP_EXTENT_ERROR0x0008 /* mapping error, errno in fe_start*/
+#define FIEMAP_EXTENT_NO_DIRECT0x0010 /* cannot access data 
directly */
+#define FIEMAP_EXTENT_LAST 0x0020 /* last extent in the file */
+#define FIEMAP_EXTENT_DELALLOC 0x0040 /* has data but not yet written,
+   * must have EXTENT_UNKNOWN set */
+#define FIEMAP_EXTENT_SECONDARY0x0080 /* data (also) in secondary 
storage,
+   * not in primary if EXTENT_UNKNOWN*/
+#define FIEMAP_EXTENT_EOF  0x0100 /* if fm_start+fm_len is beyond EOF*/
+
+#endif /* _LINUX_FIEMAP_H */


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] FIEMAP ioctl for ext4

2007-11-12 Thread Kalpak Shah
Recently there was discussion about an "FIle Extent MAP"(FIEMAP) ioctl for 
efficiently mapping the extents and holes of a file. This will be many times 
more efficient than FIBMAP by cutting down the number of ioctls.

This patch adds the FIEMAP ioctl for ext4. The spec for the FIEMAP ioctl was 
posted earlier by Andreas Dilger and can be found at:
http://www.mail-archive.com/[EMAIL PROTECTED]/msg03944.html

Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]> 

Index: linux-2.6.23.1/fs/ext4/ioctl.c
===
--- linux-2.6.23.1.orig/fs/ext4/ioctl.c
+++ linux-2.6.23.1/fs/ext4/ioctl.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned long arg)
@@ -248,6 +249,9 @@ flags_err:
 
return err;
}
+   case EXT4_IOC_FIEMAP: {
+   return ext4_fiemap(inode, filp, cmd, arg);
+   }
 
default:
return -ENOTTY;
Index: linux-2.6.23.1/include/linux/ext4_fs.h
===
--- linux-2.6.23.1.orig/include/linux/ext4_fs.h
+++ linux-2.6.23.1/include/linux/ext4_fs.h
@@ -228,15 +228,20 @@ struct ext4_new_group_data {
 #defineEXT4_IOC_SETFLAGS   FS_IOC_SETFLAGS
 #defineEXT4_IOC_GETVERSION _IOR('f', 3, long)
 #defineEXT4_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT4_IOC_GETRSVSZ  _IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ  _IOW('f', 6, long)
 #define EXT4_IOC_GROUP_EXTEND  _IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
+#define EXT4_IOC_FIEMAP_IOWR('f', 10, struct fiemap)
 #defineEXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
 #defineEXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
 #endif
-#define EXT4_IOC_GETRSVSZ  _IOR('f', 5, long)
-#define EXT4_IOC_SETRSVSZ  _IOW('f', 6, long)
+
+/* ext4 only handles a single LUN, ignore LUN_OFFSET flag */
+#define EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP (FIEMAP_FLAG_INCOMPAT &   \
+ ~(FIEMAP_FLAG_LUN_OFFSET))
 
 /*
  * ioctl commands in 32 bit emulation
@@ -1067,6 +1072,8 @@ ext4_get_blocks_wrap(handle_t *handle, s
return ext4_get_blocks_handle(handle, inode, block, max_blocks, bh,
create, extend_disksize);
 }
+extern int ext4_fiemap(struct inode *inode, struct file *filp, unsigned int 
cmd,
+  unsigned long arg);
 

 #endif /* __KERNEL__ */
Index: linux-2.6.23.1/include/linux/ext4_fs_extents.h
===
--- linux-2.6.23.1.orig/include/linux/ext4_fs_extents.h
+++ linux-2.6.23.1/include/linux/ext4_fs_extents.h
@@ -131,8 +131,8 @@ struct ext4_ext_path {
  * callback must return valid extent (passed or newly created)
  */
 typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
-   struct ext4_ext_cache *,
-   void *);
+   struct ext4_ext_cache *,
+   struct ext4_extent *, void *);
 
 #define EXT_CONTINUE   0
 #define EXT_BREAK  1
Index: linux-2.6.23.1/fs/ext4/extents.c
===
--- linux-2.6.23.1.orig/fs/ext4/extents.c
+++ linux-2.6.23.1/fs/ext4/extents.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 

@@ -1512,7 +1513,7 @@ int ext4_ext_walk_space(struct inode *in
}
 
BUG_ON(cbex.ec_len == 0);
-   err = func(inode, path, &cbex, cbdata);
+   err = func(inode, path, &cbex, ex, cbdata);
ext4_ext_drop_refs(path);
 
if (err < 0)
@@ -2629,3 +2630,163 @@ retry:
 
return ret > 0 ? ret2 : ret;
 }
+
+struct fiemap_internal {
+   struct fiemap   *fiemap_s;
+   struct fiemap_extentfm_extent;
+   size_t  tot_mapping_len;
+   char*cur_ext_ptr;
+   int current_extent;
+   int err;
+};
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+  struct ext4_ext_cache *newex, struct ext4_extent *ex,
+  void *data)
+{
+   struct fiemap_internal *fiemap_i = data;
+   struct 

Re: [EXT4 set 7][PATCH 1/1]Remove 32000 subdirs limit.

2007-07-17 Thread Kalpak Shah
On Fri, 2007-07-13 at 09:53 -0700, Andrew Morton wrote:
> On Fri, 13 Jul 2007 16:00:48 +0530 Kalpak Shah <[EMAIL PROTECTED]> wrote:
> 
> > > >  
> > > > -   if (inode->i_nlink >= EXT4_LINK_MAX)
> > > > +   if (EXT4_DIR_LINK_MAX(inode))
> > > > return -EMLINK;
> > > 
> > > argh.  WHY_IS_EXT4_FULL_OF_UPPER_CASE_MACROS_WHICH_COULD_BE_IMPLEMENTED
> > > as_lower_case_inlines?  Sigh.  It's all the old-timers, I guess.
> > > 
> > > EXT4_DIR_LINK_MAX() is buggy: it evaluates its arg twice.
> > 
> > #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= 
> > EXT4_LINK_MAX)
> > 
> > This just checks if directory has hash indexing in which case we need not 
> > worry about EXT4_LINK_MAX subdir limit. If directory is not hash indexed 
> > then we will need to enforce a max subdir limit. 
> > 
> > Sorry, I didn't understand what is the problem with this macro?
> 
> Macros should never evaluate their argument more than once, because if they
> do they will misbehave when someone passes them an
> expression-with-side-effects:
> 
>   struct inode *p = q;
> 
>   EXT4_DIR_LINK_MAX(p++);
> 
> one expects `p' to have the value q+1 here.  But it might be q+2.
> 
> and
> 
>   EXT4_DIR_LINK_MAX(some_function());
> 
> might cause some_function() to be called twice.
> 
> 
> This is one of the many problems which gets fixed when we write code in C
> rather than in cpp.

Thanks. Here is the fix converting these macros into inlined functions.



The EXT4_DIR_LINK_MAX(dir) and EXT4_DIR_LINK_EMPTY(dir) macros were
evaluating their argument twice so convert them into inlined functions.

Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]>

Index: linux-2.6.22/fs/ext4/namei.c
===
--- linux-2.6.22.orig/fs/ext4/namei.c
+++ linux-2.6.22/fs/ext4/namei.c
@@ -1742,7 +1742,7 @@ static int ext4_mkdir(struct inode * dir
struct ext4_dir_entry_2 * de;
int err, retries = 0;

-   if (EXT4_DIR_LINK_MAX(dir))
+   if (ext4_dir_link_max(dir))
return -EMLINK;

 retry:
@@ -2062,7 +2062,7 @@ static int ext4_rmdir (struct inode * di
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
-   if (!EXT4_DIR_LINK_EMPTY(inode))
+   if (!ext4_dir_link_empty(inode))
ext4_warning (inode->i_sb, "ext4_rmdir",
  "empty directory has too many links (%d)",
  inode->i_nlink);
@@ -2201,7 +2201,7 @@ static int ext4_link (struct dentry * ol
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;

-   if (EXT4_DIR_LINK_MAX(inode))
+   if (ext4_dir_link_max(inode))
return -EMLINK;

/*
Index: linux-2.6.22/include/linux/ext4_fs.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs.h
+++ linux-2.6.22/include/linux/ext4_fs.h
@@ -797,12 +797,18 @@ struct ext4_dir_entry_2 {
   #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
  EXT4_FEATURE_COMPAT_DIR_INDEX) && 
\
  (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
-#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
-#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+static inline int ext4_dir_link_max(struct inode *dir)
+{
+   return (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX);
+}
+static inline int ext4_dir_link_empty(struct inode *dir)
+{
+   return ((dir)->i_nlink == 2 || (dir)->i_nlink == 1);
+}
 #else
   #define is_dx(dir) 0
-#define EXT4_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT4_LINK_MAX)
-#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
+#define ext4_dir_link_max(dir) ((dir)->i_nlink >= EXT4_LINK_MAX)
+#define ext4_dir_link_empty(dir) ((dir)->i_nlink == 2)
 #endif

 /* Legal values for the dx_root hash_version field: */



-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-17 Thread Kalpak Shah
On Mon, 2007-07-16 at 17:49 -0700, Mingming Cao wrote:
> On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
> > On Sun, 01 Jul 2007 03:36:56 -0400
> > Mingming Cao <[EMAIL PROTECTED]> wrote:
> > > +static inline __le32 ext4_encode_extra_time(struct timespec *time)
> > > +{
> > > +   return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
> > > +time->tv_sec >> 32 : 0) |
> > > +((time->tv_nsec << 2) & EXT4_NSEC_MASK));
> > > +}
> > > +
> > > +static inline void ext4_decode_extra_time(struct timespec *time, __le32 
> > > extra)
> > > +{
> > > +   if (sizeof(time->tv_sec) > 4)
> > > +time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
> > > +<< 32;
> > > +   time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
> > > +}
> > 
> > Consider uninlining these functions.
> > 
> I got compile warining after apply Kalpal's update nanosecond patch,
> which makes these two function inline. It complains these functions are
> defined but not used. It's being used only in the following
> micros(EXT4_INODE_SET_XTIME etc).  So if the .c file included the
> ext4_fs.h but not using the micros, the compile will think these two
> functions are not used.

The compile warnings were introduced because the functions were
uninlined. So we can either keep these functions inlined or consider
adding a "__used" attribute to these two functions. 

Thanks,
Kalpak.

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 7][PATCH 1/1]Remove 32000 subdirs limit.

2007-07-13 Thread Kalpak Shah
The updated patch is attached. comments inline...

On Tue, 2007-07-10 at 22:40 -0700, Andrew Morton wrote:
> > If we exceed 65000 subdirectories in an htree directory it sets the
> > inode link count to 1 and no longer counts subdirectories.  The
> > directory link count is not actually used when determining if a
> > directory is empty, as that only counts subdirectories and not regular
> > files that might be in there. 
> > 
> > A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if
> > the subdir count for any directory crosses 65000.
> > 
> 
> Would I be correct in assuming that a later fsck will clear
> EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any >65000 subdir
> directories?
> 
> If so, that is worth a mention in the changelog, perhaps?

The changelog has been updated to include this.

> >  
> > +static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
> > +{
> > +   inc_nlink(inode);
> > +   if (is_dx(inode) && inode->i_nlink > 1) {
> > +   /* limit is 16-bit i_links_count */
> > +   if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
> > +   inode->i_nlink = 1;
> > +   EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
> > + EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
> > +   }
> > +   }
> > +}
> 
> Looks too big to be inlined.
> 
> Why do we set EXT4_FEATURE_RO_COMPAT_DIR_NLINK if i_nlink==2?

I have added a comment for this. (since it indicates that nlinks==1
previously).

> 
> > +static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
> > +{
> > +   drop_nlink(inode);
> > +   if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
> > +   inc_nlink(inode);
> > +}
> 
> Probably too big to inline.

Removed the inline.

> >  
> > -   if (inode->i_nlink >= EXT4_LINK_MAX)
> > +   if (EXT4_DIR_LINK_MAX(inode))
> > return -EMLINK;
> 
> argh.  WHY_IS_EXT4_FULL_OF_UPPER_CASE_MACROS_WHICH_COULD_BE_IMPLEMENTED
> as_lower_case_inlines?  Sigh.  It's all the old-timers, I guess.
> 
> EXT4_DIR_LINK_MAX() is buggy: it evaluates its arg twice.

#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)

This just checks if directory has hash indexing in which case we need not worry 
about EXT4_LINK_MAX subdir limit. If directory is not hash indexed then we will 
need to enforce a max subdir limit. 

Sorry, I didn't understand what is the problem with this macro?

Thanks,
Kalpak.
This patch adds support to ext4 for allowing more than 65000
subdirectories. Currently the maximum number of subdirectories is capped
at 32000.

If we exceed 65000 subdirectories in an htree directory it sets the
inode link count to 1 and no longer counts subdirectories.  The
directory link count is not actually used when determining if a
directory is empty, as that only counts subdirectories and not regular
files that might be in there. 

A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if
the subdir count for any directory crosses 65000. A later fsck will clear
EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any directory
with >65000 subdirs.

Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]>


---
 fs/ext4/namei.c |   52 +++-
 include/linux/ext4_fs.h |4 ++-
 2 files changed, 41 insertions(+), 15 deletions(-)

Index: linux-2.6.22/fs/ext4/namei.c
===
--- linux-2.6.22.orig/fs/ext4/namei.c
+++ linux-2.6.22/fs/ext4/namei.c
@@ -1617,6 +1617,35 @@ static int ext4_delete_entry (handle_t *
 	return -ENOENT;
 }
 
+/*
+ * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode) && inode->i_nlink > 1) {
+		/* limit is 16-bit i_links_count */
+		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
+			inode->i_nlink = 1;
+			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+	  EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+		}
+	}
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * directory has >EXT4_LINK_MAX subdirs.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	drop_nlink(inode);
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
+		inc_nlink(inode);
+}
+
+
 static int ext4_add_nondir(handle_t *handle,
 		struct dentry *dentry, struct 

Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-13 Thread Kalpak Shah
On Fri, 2007-07-13 at 09:59 +0530, Aneesh Kumar K.V wrote:
> 
> Kalpak Shah wrote:
> > On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
> >> On Sun, 01 Jul 2007 03:36:56 -0400
> >> Mingming Cao <[EMAIL PROTECTED]> wrote:
> >>
> >>> This patch is a spinoff of the old nanosecond patches.
> >> I don't know what the "old nanosecond patches" are.  A link to a suitable
> >> changlog for those patches would do in a pinch.  Preferable would be to
> >> write a proper changelog for this patch.
> > 
> > The incremental patch contains a proper changelog describing the patch.
> > 
> 
> 
> Instead of  putting incremental patches it would be nice if we can have 
> replacement patches.
> for the already existing patches with the comments addressed. For example if 
> we have a 
> review comment on the patch message ( commit log ) then adding an incremental 
> patch doesn't help.

I think that it would be easier to review just the changes that have
been made to the patches instead of having people go through the entire
patch again. I was hoping that someone with write access to ext4-git
would update the commit logs.

If replacement patches are preferred, then I will send them again.

Thanks,
Kalpak.

> 
> 
> -aneesh
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-12 Thread Kalpak Shah
On Tue, 2007-07-10 at 16:30 -0700, Andrew Morton wrote:
> On Sun, 01 Jul 2007 03:36:56 -0400
> Mingming Cao <[EMAIL PROTECTED]> wrote:
> 
> > This patch is a spinoff of the old nanosecond patches.
> 
> I don't know what the "old nanosecond patches" are.  A link to a suitable
> changlog for those patches would do in a pinch.  Preferable would be to
> write a proper changelog for this patch.

The incremental patch contains a proper changelog describing the patch.

> > +static inline __le32 ext4_encode_extra_time(struct timespec *time)
> > +{
> > +   return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
> > +  time->tv_sec >> 32 : 0) |
> > +  ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
> > +}
> > +
> > +static inline void ext4_decode_extra_time(struct timespec *time, __le32 
> > extra)
> > +{
> > +   if (sizeof(time->tv_sec) > 4)
> > +  time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
> > +  << 32;
> > +   time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
> > +}
> 
> Consider uninlining these functions.

Done.

This patch adds comments, few small corrections and an appropriate
changelog entry.

Thanks,
Kalpak.
This patch adds nanosecond timestamps for ext4. This involves adding
*time_extra fields to the ext4_inode to extend the timestamps to 64-bits.
Creation time is also added by this patch.

These extended fields will fit into an inode if the filesystem was formatted
with large inodes (-I 256 or larger) and there are currently no EAs
consuming all of the available space. For new inodes we always reserve
enough space for the kernel's known extended fields, but for inodes
created with an old kernel this might not have been the case. So this patch
also adds the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature flag(ro-compat so that
older kernels can't create inodes with a smaller extra_isize). which indicates
if the fields fitting inside s_min_extra_isize are available or not.
If the expansion of inodes if unsuccessful then this feature will be disabled.
This feature is only enabled if requested by the sysadmin.

None of the extended inode fields is critical for correct filesystem operation.

Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]>
Signed-off-by: Eric Sandeen <[EMAIL PROTECTED]>
Signed-off-by: Dave Kleikamp <[EMAIL PROTECTED]>
Signed-off-by: Mingming Cao <[EMAIL PROTECTED]>

Index: linux-2.6.22/include/linux/ext4_fs.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs.h
+++ linux-2.6.22/include/linux/ext4_fs.h
@@ -350,20 +350,30 @@ struct ext4_inode {
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
 
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ */
 #define EXT4_FITS_IN_INODE(ext4_inode, einode, field)	\
 	((offsetof(typeof(*ext4_inode), field) +	\
 	  sizeof((ext4_inode)->field))			\
 	<= (EXT4_GOOD_OLD_INODE_SIZE +			\
 	(einode)->i_extra_isize))			\
 
-static inline __le32 ext4_encode_extra_time(struct timespec *time)
+static __le32 ext4_encode_extra_time(struct timespec *time)
 {
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
 			   time->tv_sec >> 32 : 0) |
 			   ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
 }
 
-static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+static void ext4_decode_extra_time(struct timespec *time, __le32 extra)
 {
if (sizeof(time->tv_sec) > 4)
 	   time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
Index: linux-2.6.22/include/linux/ext4_fs_i.h
===
--- linux-2.6.22.orig/include/linux/ext4_fs_i.h
+++ linux-2.6.22/include/linux/ext4_fs_i.h
@@ -153,6 +153,10 @@ struct ext4_inode_info {
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
 	struct timespec i_crtime;
 };
 


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-12 Thread Kalpak Shah
On Tue, 2007-07-10 at 16:32 -0700, Andrew Morton wrote:
> On Sun, 01 Jul 2007 03:38:01 -0400
> Mingming Cao <[EMAIL PROTECTED]> wrote:
> 
> > This patch is on top of the nanosecond timestamp and i_version_hi
> > patches. 
> 
> This sort of information isn't needed (or desired) when this patch hits the
> git tree.  Please ensure that things like this are cleaned up before the
> patches go to Linus.

The incremental patch I have attached contains an updated Changelog
entry which cleans this up.

> > +   !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > +   /* We need extra buffer credits since we may write into EA block
> > +* with this same handle */
> > +   if ((jbd2_journal_extend(handle,
> > +EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> > +   ret = ext4_expand_extra_isize(inode,
> > +   EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > +   iloc, handle);
> > +   if (ret) {
> > +   EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> > +   if (!expand_message) {
> > +   ext4_warning(inode->i_sb, __FUNCTION__,
> > +   "Unable to expand inode %lu. Delete"
> > +   " some EAs or run e2fsck.",
> > +   inode->i_ino);
> > +   expand_message = 1;
> > +   }
> > +   }
> > +   }
> > +   }
> 
> Maybe that message should come out once per mount rather than once per boot
> (or once per modprobe)?

Done. Now the message gets printed the first time s_mnt_count changes,
which means once per mount.

> > +   if (free < new_extra_isize) {
> > +   if (!tried_min_extra_isize && s_min_extra_isize) {
> > +   tried_min_extra_isize++;
> > +   new_extra_isize = s_min_extra_isize;
> 
> Aren't we missing a brelse(bh) here?

I have corrected this.

> >  
> > +#define IHDR(inode, raw_inode) \
> > +   ((struct ext4_xattr_ibody_header *) \
> > +   ((void *)raw_inode + \
> > +   EXT4_GOOD_OLD_INODE_SIZE + \
> > +   EXT4_I(inode)->i_extra_isize))
> > +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
> 
> Neither of these _have_ to be implemented as macros and hence they should
> not be.

These macros existed before and have just been moved. There are similar
such macros in the ext3/4 xattr code and they should probably be changed
to inlines. But that should be done in a different patch.

Thanks,
Kalpak.
We need to make sure that existing ext3 filesystems can also avail the new
fields that have been added to the ext4 inode. We use s_want_extra_isize and
s_min_extra_isize to decide by how much we should expand the inode. If
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand the inode by
max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
about whether users should be able to set s_*_extra_isize smaller than
the known fields or not.

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

This would be online expansion of inodes. expand_extra_isize option will also
be added to e2fsck which will expand all the inodes and if for any reason 
expansion fails for any inode then the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE
feature will be reset.

Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
Signed-off-by: Kalpak Shah <[EMAIL PROTECTED]>
Signed-off-by: Mingming Cao <[EMAIL PROTECTED]>

Index: linux-2.6.22/fs/ext4/inode.c
===
--- linux-2.6.22.orig/fs/ext4/inode.c
+++ linux-2.6.22/fs/ext4/inode.c
@@ -3130,9 +3130,8 @@ int ext4_expand_extra_isize(struct inode
 	struct ext4_xattr_ibody_header *header;
 	struct

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-11 Thread Kalpak Shah
On Wed, 2007-07-11 at 10:34 -0700, Andrew Morton wrote:
> On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger <[EMAIL PROTECTED]> wrote:
> 
> > On Jul 10, 2007  16:32 -0700, Andrew Morton wrote:
> > > > err = ext4_reserve_inode_write(handle, inode, &iloc);
> > > > +   if (EXT4_I(inode)->i_extra_isize <
> > > > +   EXT4_SB(inode->i_sb)->s_want_extra_isize &&
> > > > +   !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > > > +   /* We need extra buffer credits since we may write into 
> > > > EA block
> > > > +* with this same handle */
> > > > +   if ((jbd2_journal_extend(handle,
> > > > +EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 
> > > > 0) {
> > > > +   ret = ext4_expand_extra_isize(inode,
> > > > +   
> > > > EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > > > +   iloc, handle);
> > > > +   if (ret) {
> > > > +   EXT4_I(inode)->i_state |= 
> > > > EXT4_STATE_NO_EXPAND;
> > > > +   if (!expand_message) {
> > > > +   ext4_warning(inode->i_sb, 
> > > > __FUNCTION__,
> > > > +   "Unable to expand inode %lu. 
> > > > Delete"
> > > > +   " some EAs or run e2fsck.",
> > > > +   inode->i_ino);
> > > > +   expand_message = 1;
> > > > +   }
> > > > +   }
> > > > +   }
> > > > +   }
> > > 
> > > Maybe that message should come out once per mount rather than once per 
> > > boot
> > > (or once per modprobe)?
> > 
> > Probably true.
> > 
> > > What are the consequences of a jbd2_journal_extend() failure in here?
> > 
> > Not fatal, just like every user of i_extra_isize.  If the inode isn't a
> > large inode, or it can't be expanded then there will be a minor loss of
> > functionality on that inode.  If the i_extra_isize is critical, then
> > the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
> > 
> > Note that this is only applicable for filesystems which are upgraded.  For
> > new inodes (i.e. all inodes that exist in the filesystem if it was always
> > run on a kernel with the currently understood extra fields) then this will
> > never be invoked (until such a time new extra fields are added).
> 
> I'd suggest that we get a comment in the code explaining this: this
> unchecked error does rather stand out.
> 
> > > > +   if (EXT4_I(inode)->i_file_acl) {
> > > > +   bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> > > > +   error = -EIO;
> > > > +   if (!bh)
> > > > +   goto cleanup;
> > > > +   if (ext4_xattr_check_block(bh)) {
> > > > +   ext4_error(inode->i_sb, __FUNCTION__,
> > > > +   "inode %lu: bad block %llu", 
> > > > inode->i_ino,
> > > > +   EXT4_I(inode)->i_file_acl);
> > > > +   error = -EIO;
> > > > +   goto cleanup;
> > > > +   }
> > > > +   base = BHDR(bh);
> > > > +   first = BFIRST(bh);
> > > > +   end = bh->b_data + bh->b_size;
> > > > +   min_offs = end - base;
> > > > +   free = ext4_xattr_free_space(first, &min_offs, base,
> > > > +&total_blk);
> > > > +   if (free < new_extra_isize) {
> > > > +   if (!tried_min_extra_isize && 
> > > > s_min_extra_isize) {
> > > > +   tried_min_extra_isize++;
> > > > +   new_extra_isize = s_min_extra_isize;
> > > 
> > > Aren't we missing a brelse(bh) here?
> > 
> > Seems likely, yes.
> 
> OK - could we get a positive ack from someone indicating that this will get
> looked at?  Because I am about to forget about it.

I will send a patch to add the comments and make the suggested
corrections.

Thanks,
Kalpak.

> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-03 Thread Kalpak Shah
On Sun, 2007-07-01 at 03:36 -0400, Mingming Cao wrote:
> +
> +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)
>\
> +do {\
> + (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);   \
> + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
> + ext4_decode_extra_time(&(inode)->xtime,\
> +raw_inode->xtime ## _extra);\
> +} while (0)
> +
> +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)  
>\
> +do {\
> + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
> + (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);  \
> + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
> + ext4_decode_extra_time(&(einode)->xtime,   \
> +raw_inode->xtime ## _extra);\
> +} while (0)
> +

This nanosecond patch seems to be missing the fix below which is required for 
http://bugzilla.kernel.org/show_bug.cgi?id=5079

If the timestamp is set to before epoch i.e. a negative timestamp then the file 
may have its date set into the future on 64-bit systems. So when the timestamp 
is read it must be cast as signed.

Index: linux-2.6.21/include/linux/ext4_fs.h
===
--- linux-2.6.21.orig/include/linux/ext4_fs.h
+++ linux-2.6.21/include/linux/ext4_fs.h
@@ -390,7 +390,7 @@ do {
   \

 #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
 do {  \
-   (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);   \
+   (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
ext4_decode_extra_time(&(inode)->xtime,\
   raw_inode->xtime ## _extra);\
@@ -399,7 +399,8 @@ do {
   \
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)
   \
 do {  \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
-   (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);  \
+   (einode)->xtime.tv_sec =   \
+   (signed)le32_to_cpu((raw_inode)->xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
ext4_decode_extra_time(&(einode)->xtime,   \
   raw_inode->xtime ## _extra);\


Thanks,
Kalpak.

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 3][PATCH 1/1] ext4 nanosecond timestamp

2007-07-03 Thread Kalpak Shah
On Sun, 2007-07-01 at 03:36 -0400, Mingming Cao wrote:
> +
> +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)
>\
> +do {\
> + (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);   \
> + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
> + ext4_decode_extra_time(&(inode)->xtime,\
> +raw_inode->xtime ## _extra);\
> +} while (0)
> +
> +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)  
>\
> +do {\
> + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
> + (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);  \
> + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
> + ext4_decode_extra_time(&(einode)->xtime,   \
> +raw_inode->xtime ## _extra);\
> +} while (0)
> +

This nanosecond patch seems to be missing the fix below which is required for 
http://bugzilla.kernel.org/show_bug.cgi?id=5079

If the timestamp is set to before epoch i.e. a negative timestamp then the file 
may have its date set into the future on 64-bit systems. So when the timestamp 
is read it must be cast as signed.

Index: linux-2.6.21/include/linux/ext4_fs.h
===
--- linux-2.6.21.orig/include/linux/ext4_fs.h
+++ linux-2.6.21/include/linux/ext4_fs.h
@@ -390,7 +390,7 @@ do {
   \

 #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
 do {  \
-   (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);   \
+   (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
ext4_decode_extra_time(&(inode)->xtime,\
   raw_inode->xtime ## _extra);\
@@ -399,7 +399,8 @@ do {
   \
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)
   \
 do {  \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))  \
-   (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);  \
+   (einode)->xtime.tv_sec =   \
+   (signed)le32_to_cpu((raw_inode)->xtime);   \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))\
ext4_decode_extra_time(&(einode)->xtime,   \
   raw_inode->xtime ## _extra);\


Thanks,
Kalpak.


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ChunkFS - measuring cross-chunk references

2007-04-23 Thread Kalpak Shah
On Mon, 2007-04-23 at 12:49 +0530, Karuna sagar K wrote:
> Hi,
> 
> The tool estimates the cross-chunk references from an extt2/3 file
> system. It considers a block group as one chunk and calcuates how many
> block groups does a file span across. So, the block group size gives
> the estimate of chunk size.
> 
> The file systems were aged for about 3-4 months on a developers laptop.
> 

With a blocksize of 4KB, a block group would be 128 MB. In the original
Chunkfs paper, Valh had mentioned 1GB chunks and I believe it will be
possible to use 2GB, 4GB or 8GB chunks in the future. As the chunk size
increases the number of cross-chunk references will reduce and hence it
might be a good idea to present these statistics considering different
chunk sizes starting from 512MB upto 2GB.

Thanks,
Kalpak.

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Testing framework

2007-04-23 Thread Kalpak Shah
On Mon, 2007-04-23 at 02:16 +0530, Karuna sagar K wrote:
> Hi,
> 
> For some time I had been working on this file system test framework.
> Now I have a implementation for the same and below is the explanation.
> Any comments are welcome.
> 
> Introduction:
> The testing tools and benchmarks available around do not take into
> account the repair and recovery aspects of file systems. The test
> framework described here focuses on repair and recovery capabilities
> of file systems. Since most file systems use 'fsck' to recover from
> file system inconsistencies, the test framework characterizes file
> systems based on outcomes of running 'fsck'.



> Higher level perspective/approach:
> In this approach the file system is viewed as a tree of nodes, where
> nodes are either files or directories. The metadata information
> corresponding to some randomly chosen nodes of the tree are corrupted.
> Nodes which are corrupted are marked or recorded to be able to replay
> later. This file system is called source file system while the file
> system on which we need to replay the corruption is called target file
> system. The assumption is that the target file system contains a set
> of files and directories which is a superset of that in the source
> file system. Hence to replay the corruption we need point out which
> nodes in the source file system were corrupted in the source file
> system and corrupt the corresponding nodes in the target file system.
> 
> A major disadvantage with this approach is that on-disk structures
> (like superblocks, block group descriptors, etc.) are not considered
> for corruption.
> 
> Lower level perspective/approach:
> The file system is looked upon as a set of blocks (more precisely
> metadata blocks). We randomly choose from this set of blocks to
> corrupt. Hence we would be able to overcome the deficiency of the
> previous approach. However this approach makes it difficult to have a
> replayable corruption. Further thought about this approach has to be
> given.
> 

Fill a test filesystem with data and save it. Corrupt it by copying a
chunk of data from random locations A to B. Save positions A and B so
that you can reproduce the corruption. 

Or corrupt random bits (ideally in metadata blocks) and maintain the
list of the bit numbers for reproducing the corruption.

> We could have a blend of both the approaches in the program to
> compromise between corruption and replayability.
> 
> Repair Phase:
> The corrupted file system is repaired and recovered with 'fsck' or any
> other tools; this phase considers the repair and recovery action on
> the file system as a black box. The time taken to repair by the tool
> is measured.

I see that you are running fsck just once on the test filesystem. It
might be a good idea to run it twice and if second fsck does not find
the filesystem to be completely clean that means it is a bug in fsck.



> Summary Phase:
> This is the final phase in the model. A report file is prepared which
> summarizes the result of this test run. The summary contains:
> 
> Average time taken for recovery
> Number of files lost at the end of each iteration
> Number of files with metadata corruption at the end of each iteration
> Number of files with data corruption at the end of each iteration
> Number of files lost and found at the end of each iteration
> 
> Putting it all together:
> The Corruption, Repair and Comparison phases could be repeated a
> number of times (each repetition is called an iteration) before the
> summary of that test run is prepared.
> 
> TODO:
> Account for files in the lost+found directory during the comparison phase.
> Support for other file systems (only ext2 is supported currently)
> State of the either file system is stored, which may be huge, time
> consuming and not necessary. So, we could have better ways of storing
> the state.

Also, people may want to test with different mount options, so something
like "mount -t $fstype -o loop,$MOUNT_OPTIONS $imgname $mountpt" may be
useful. Similarly it may also be useful to have MKFS_OPTIONS while
formatting the filesystem.

Thanks,
Kalpak.

> 
> Comments are welcome!!
> 
> Thanks,
> Karuna

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html