Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Jens Axboe
On 06/27/2017 08:42 AM, Christoph Hellwig wrote:
> The API looks ok, but the code could use some cleanups.  What do
> you think about the incremental patch below:
> 
> It refactors various manipulations, and stores the write hint right
> in the iocb as there is a 4 byte hole (this will need some minor
> adjustments in the next patches):

How's this? Fixes for compile, and also squeeze an enum rw_hint into
a hole in the inode structure.

I'll refactor around this and squeeze into bio/rq holes as well, then
re-test it.

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..25f96a101f1a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,62 @@ static int f_getowner_uids(struct file *filp, unsigned 
long arg)
 }
 #endif
 
+static bool rw_hint_valid(enum rw_hint hint)
+{
+   switch (hint) {
+   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NONE:
+   case RWH_WRITE_LIFE_SHORT:
+   case RWH_WRITE_LIFE_MEDIUM:
+   case RWH_WRITE_LIFE_LONG:
+   case RWH_WRITE_LIFE_EXTREME:
+   return true;
+   default:
+   return false;
+   }
+}
+
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+   struct inode *inode = file_inode(file);
+   u64 *argp = (u64 __user *)arg;
+   enum rw_hint hint;
+
+   switch (cmd) {
+   case F_GET_FILE_RW_HINT:
+   if (put_user(__file_write_hint(file), argp))
+   return -EFAULT;
+   return 0;
+   case F_SET_FILE_RW_HINT:
+   if (get_user(hint, argp))
+   return -EFAULT;
+   if (!rw_hint_valid(hint))
+   return -EINVAL;
+
+   spin_lock(>f_lock);
+   file->f_write_hint = hint;
+   spin_unlock(>f_lock);
+   return 0;
+   case F_GET_RW_HINT:
+   if (put_user(__inode_write_hint(inode), argp))
+   return -EFAULT;
+   return 0;
+   case F_SET_RW_HINT:
+   if (get_user(hint, argp))
+   return -EFAULT;
+   if (!rw_hint_valid(hint))
+   return -EINVAL;
+
+   inode_lock(inode);
+   inode->i_write_hint = hint;
+   inode_unlock(inode);
+   return 0;
+   default:
+   return -EINVAL;
+   }
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
 {
@@ -337,6 +393,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned 
long arg,
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
+   case F_GET_RW_HINT:
+   case F_SET_RW_HINT:
+   case F_GET_FILE_RW_HINT:
+   case F_SET_FILE_RW_HINT:
+   err = fcntl_rw_hint(filp, cmd, arg);
+   break;
default:
break;
}
diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..f0e5fc77e6a4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode 
*inode)
i_gid_write(inode, 0);
atomic_set(>i_writecount, 0);
inode->i_size = 0;
+   inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
diff --git a/fs/open.c b/fs/open.c
index cd0c5be8d012..3fe0c4aa7d27 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f,
 likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
 
+   f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 
file_ra_state_init(>f_ra, f->f_mapping->host->i_mapping);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4574121f4746..4587a181162e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -265,6 +265,20 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+#include 
+
+/*
+ * Write life time hint values.
+ */
+enum rw_hint {
+   WRITE_LIFE_NOT_SET  = 0,
+   WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+   WRITE_LIFE_SHORT= RWH_WRITE_LIFE_SHORT,
+   WRITE_LIFE_MEDIUM   = RWH_WRITE_LIFE_MEDIUM,
+   WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+   WRITE_LIFE_EXTREME  = RWH_WRITE_LIFE_EXTREME,
+};
+
 #define IOCB_EVENTFD   (1 << 0)
 #define IOCB_APPEND(1 << 1)
 #define IOCB_DIRECT(1 << 2)
@@ -280,6 +294,7 @@ struct kiocb {
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void*private;
int ki_flags;
+   enum rw_hintki_hint;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -597,6 +612,7 @@ struct inode {
spinlock_t  i_lock; /* i_blocks, i_bytes, maybe 

Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Christoph Hellwig
On Tue, Jun 27, 2017 at 09:09:48AM -0600, Jens Axboe wrote:
> On 06/27/2017 08:42 AM, Christoph Hellwig wrote:
> > The API looks ok, but the code could use some cleanups.  What do
> > you think about the incremental patch below:
> > 
> > It refactors various manipulations, and stores the write hint right
> > in the iocb as there is a 4 byte hole (this will need some minor
> > adjustments in the next patches):
> 
> How's this? Fixes for compile, and also squeeze an enum rw_hint into
> a hole in the inode structure.
> 
> I'll refactor around this and squeeze into bio/rq holes as well, then
> re-test it.

Looks good, minor nitpick below:

> index 4574121f4746..4587a181162e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -265,6 +265,20 @@ struct page;
>  struct address_space;
>  struct writeback_control;
>  
> +#include 

I didn't seem to need the move.  But if you want to move it can
we keep all the includes together at the very top?

> +static inline enum rw_hint __inode_write_hint(struct inode *inode)
> +{
> + return inode->i_write_hint;
> +}
> +
> +static inline enum rw_hint inode_write_hint(struct inode *inode)
> +{
> + enum rw_hint ret = __inode_write_hint(inode);
> + if (ret != WRITE_LIFE_NOT_SET)
> + return ret;
> + return WRITE_LIFE_NONE;
> +}
> +
> +static inline enum rw_hint __file_write_hint(struct file *file)
> +{
> + if (file->f_write_hint != WRITE_LIFE_NOT_SET)
> + return file->f_write_hint;
> +
> + return __inode_write_hint(file_inode(file));
> +}
> +
> +static inline enum rw_hint file_write_hint(struct file *file)
> +{
> + enum rw_hint ret = __file_write_hint(file);
> + if (ret != WRITE_LIFE_NOT_SET)
> + return ret;
> + return WRITE_LIFE_NONE;
> +}

I'd say kill all these helpers and just treat both WRITE_LIFE_NONE
and WRITE_LIFE_NOT_SET special all the way down in NVMe.



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Jens Axboe
On 06/27/2017 09:16 AM, Christoph Hellwig wrote:
> On Tue, Jun 27, 2017 at 09:09:48AM -0600, Jens Axboe wrote:
>> On 06/27/2017 08:42 AM, Christoph Hellwig wrote:
>>> The API looks ok, but the code could use some cleanups.  What do
>>> you think about the incremental patch below:
>>>
>>> It refactors various manipulations, and stores the write hint right
>>> in the iocb as there is a 4 byte hole (this will need some minor
>>> adjustments in the next patches):
>>
>> How's this? Fixes for compile, and also squeeze an enum rw_hint into
>> a hole in the inode structure.
>>
>> I'll refactor around this and squeeze into bio/rq holes as well, then
>> re-test it.
> 
> Looks good, minor nitpick below:
> 
>> index 4574121f4746..4587a181162e 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -265,6 +265,20 @@ struct page;
>>  struct address_space;
>>  struct writeback_control;
>>  
>> +#include 
> 
> I didn't seem to need the move.  But if you want to move it can
> we keep all the includes together at the very top?

It did here, we need it for the RWH_ defines or my compile blows up.
But yeah, let's just move it to the top, not sure why it's in the
middle.

>> +static inline enum rw_hint __inode_write_hint(struct inode *inode)
>> +{
>> +return inode->i_write_hint;
>> +}
>> +
>> +static inline enum rw_hint inode_write_hint(struct inode *inode)
>> +{
>> +enum rw_hint ret = __inode_write_hint(inode);
>> +if (ret != WRITE_LIFE_NOT_SET)
>> +return ret;
>> +return WRITE_LIFE_NONE;
>> +}
>> +
>> +static inline enum rw_hint __file_write_hint(struct file *file)
>> +{
>> +if (file->f_write_hint != WRITE_LIFE_NOT_SET)
>> +return file->f_write_hint;
>> +
>> +return __inode_write_hint(file_inode(file));
>> +}
>> +
>> +static inline enum rw_hint file_write_hint(struct file *file)
>> +{
>> +enum rw_hint ret = __file_write_hint(file);
>> +if (ret != WRITE_LIFE_NOT_SET)
>> +return ret;
>> +return WRITE_LIFE_NONE;
>> +}
> 
> I'd say kill all these helpers and just treat both WRITE_LIFE_NONE
> and WRITE_LIFE_NOT_SET special all the way down in NVMe.

Sure, we can do that.

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Jens Axboe
On 06/27/2017 08:57 AM, Christoph Hellwig wrote:
> On Tue, Jun 27, 2017 at 08:55:02AM -0600, Jens Axboe wrote:
>> BTW, that patch does not look like an incremental patch, what's
>> this against?
> 
> The patch I'm replying to, without the other ones.

Looks like a replacement patch, not incremental to that. I'll
update. And I'm fine with not using flags, in fact that's what
I preferred to do initially.

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Christoph Hellwig
On Tue, Jun 27, 2017 at 08:55:02AM -0600, Jens Axboe wrote:
> BTW, that patch does not look like an incremental patch, what's
> this against?

The patch I'm replying to, without the other ones.


Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Jens Axboe
On 06/27/2017 08:42 AM, Christoph Hellwig wrote:
> The API looks ok, but the code could use some cleanups.  What do
> you think about the incremental patch below:
> 
> It refactors various manipulations, and stores the write hint right
> in the iocb as there is a 4 byte hole (this will need some minor
> adjustments in the next patches):

Sigh... Sure, that's how I did it originally as well.

BTW, that patch does not look like an incremental patch, what's
this against?

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Christoph Hellwig
On Tue, Jun 27, 2017 at 07:42:55AM -0700, Christoph Hellwig wrote:
> The API looks ok, but the code could use some cleanups.  What do
> you think about the incremental patch below:
> 
> It refactors various manipulations, and stores the write hint right
> in the iocb as there is a 4 byte hole (this will need some minor
> adjustments in the next patches):

And looking over the followons I'd love to just store the hints
directly in the inode, bio and request themselves.  We have big
enough holes at least in the bio and request to store them, although
instead of the enum which is at least in sized we'd have to make them
an explicit u16 or even u8.


Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-27 Thread Christoph Hellwig
The API looks ok, but the code could use some cleanups.  What do
you think about the incremental patch below:

It refactors various manipulations, and stores the write hint right
in the iocb as there is a 4 byte hole (this will need some minor
adjustments in the next patches):

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..c436278154b4 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,63 @@ static int f_getowner_uids(struct file *filp, unsigned 
long arg)
 }
 #endif
 
+static bool rw_hint_valid(enum rw_hint hint)
+{
+   switch (hint) {
+   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NONE:
+   case RWH_WRITE_LIFE_SHORT:
+   case RWH_WRITE_LIFE_MEDIUM:
+   case RWH_WRITE_LIFE_LONG:
+   case RWH_WRITE_LIFE_EXTREME:
+   return true;
+   default:
+   return false;
+   }
+}
+
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+   struct inode *inode = file_inode(file);
+   u64 *argp = (u64 __user *)arg;
+   enum rw_hint hint;
+
+   switch (cmd) {
+   case F_GET_FILE_RW_HINT:
+   if (put_user(__file_write_hint(file), argp))
+   return -EFAULT;
+   return 0;
+   case F_SET_FILE_RW_HINT:
+   if (get_user(hint, argp))
+   return -EFAULT;
+   if (!rw_hint_valid(hint))
+   return -EINVAL;
+
+   spin_lock(>f_lock);
+   file->f_write_hint = hint;
+   spin_unlock(>f_lock);
+   return 0;
+   case F_GET_RW_HINT:
+   if (put_user(__inode_write_hint(inode), argp))
+   return -EFAULT;
+   return 0;
+   case F_SET_RW_HINT:
+   if (get_user(hint, argp))
+   return -EFAULT;
+   if (!rw_hint_valid(hint))
+   return -EINVAL;
+
+   inode_lock(inode);
+   inode_set_flags(inode, hint << S_WRITE_LIFE_SHIFT,
+   S_WRITE_LIFE_MASK);
+   inode_unlock(inode);
+   return 0;
+   default:
+   return -EINVAL;
+   }
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
 {
@@ -337,6 +394,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned 
long arg,
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
+   case F_GET_RW_HINT:
+   case F_SET_RW_HINT:
+   case F_GET_FILE_RW_HINT:
+   case F_SET_FILE_RW_HINT:
+   err = fcntl_rw_hint(filp, cmd, arg);
+   break;
default:
break;
}
diff --git a/fs/open.c b/fs/open.c
index cd0c5be8d012..3fe0c4aa7d27 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f,
 likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
 
+   f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 
file_ra_state_init(>f_ra, f->f_mapping->host->i_mapping);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4574121f4746..a07e9ce970d1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -265,6 +265,18 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+/*
+ * Write life time hint values.
+ */
+enum rw_hint {
+   WRITE_LIFE_NOT_SET  = 0,
+   WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+   WRITE_LIFE_SHORT= RWH_WRITE_LIFE_SHORT,
+   WRITE_LIFE_MEDIUM   = RWH_WRITE_LIFE_MEDIUM,
+   WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+   WRITE_LIFE_EXTREME  = RWH_WRITE_LIFE_EXTREME,
+};
+
 #define IOCB_EVENTFD   (1 << 0)
 #define IOCB_APPEND(1 << 1)
 #define IOCB_DIRECT(1 << 2)
@@ -280,6 +292,7 @@ struct kiocb {
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void*private;
int ki_flags;
+   enum rw_hintki_hint;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -851,6 +864,7 @@ struct file {
 * Must not be taken from IRQ context.
 */
spinlock_t  f_lock;
+   enum rw_hintf_write_hint;
atomic_long_t   f_count;
unsigned intf_flags;
fmode_t f_mode;
@@ -1833,6 +1847,14 @@ struct super_operations {
 #endif
 
 /*
+ * Expected life time hint of a write for this inode. This uses the
+ * WRITE_LIFE_* encoding, we just need to define the shift. We need
+ * 3 bits for this. Next S_* value is 131072, bit 17.
+ */
+#define S_WRITE_LIFE_SHIFT 14  /* 16384, next bit */
+#define S_WRITE_LIFE_MASK  (7 << S_WRITE_LIFE_SHIFT)
+
+/*
  * Note that nosuid etc flags are 

Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
On 06/26/2017 10:09 AM, Darrick J. Wong wrote:
> On Mon, Jun 26, 2017 at 07:55:27AM -0600, Jens Axboe wrote:
>> On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
>>> Please document the userspace API (added linux-api and linux-man
>>> to CC for sugestions), especially including the odd effects of the
>>> per-inode settings.
>>
>> Of course, I'll send in a diff for the fcntl(2) man page.
>>
>>> Also I would highly recommend to use different fcntl commands
>>> for the file vs inode hints to avoid any strange behavior.
>>
>> OK, used to have that too... I can add specific _FILE versions.
> 
> While you're at it, can you also send in an xfstest or two to check the
> basic functionality of the fcntl so that we know the code reflects the
> userspace API ("I set this hint and now I can query it back" and "file
> hint overrides inode hint") that we want?

I definitely can. I already wrote the below to verify that it behaves
the way it should.


/*
 * test-writehints.c: test file/inode write hint setting/getting
 */
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#ifndef F_GET_RW_HINT
#define F_LINUX_SPECIFIC_BASE   1024
#define F_GET_RW_HINT   (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT   (F_LINUX_SPECIFIC_BASE + 12)
#define F_GET_FILE_RW_HINT  (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT  (F_LINUX_SPECIFIC_BASE + 14)

#define RWF_WRITE_LIFE_NOT_SET  0
#define RWH_WRITE_LIFE_NONE 1
#define RWH_WRITE_LIFE_SHORT2
#define RWH_WRITE_LIFE_MEDIUM   3
#define RWH_WRITE_LIFE_LONG 4
#define RWH_WRITE_LIFE_EXTREME  5

#endif

static int __get_write_hint(int fd, int cmd)
{
uint64_t hint;
int ret;

ret = fcntl(fd, cmd, );
if (ret < 0) {
perror("fcntl: F_GET_RW_FILE_HINT");
return -1;
}

return hint;
}

static int get_file_write_hint(int fd)
{
return __get_write_hint(fd, F_GET_FILE_RW_HINT);
}

static int get_inode_write_hint(int fd)
{
return __get_write_hint(fd, F_GET_RW_HINT);
}

static void set_file_write_hint(int fd, uint64_t hint)
{
uint64_t set_hint = hint;
int ret;

ret = fcntl(fd, F_SET_FILE_RW_HINT, _hint);
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return;
}
}

static void set_inode_write_hint(int fd, uint64_t hint)
{
uint64_t set_hint = hint;
int ret;

ret = fcntl(fd, F_SET_RW_HINT, _hint);
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return;
}
}

int main(int argc, char *argv[])
{
char filename[] = "/tmp/writehintsXX";
int ihint, fhint, fd;

fd = open(filename, O_RDWR | O_CREAT | 0644);
if (fd < 0) {
perror("open");
return 2;
}

/*
 * Default hints for both file and inode should be NOT_SET
 */
fhint = get_file_write_hint(fd);
if (fhint < 0)
return 0;
ihint = get_inode_write_hint(fd);
assert(fhint == ihint);
assert(fhint == RWF_WRITE_LIFE_NOT_SET);

/*
 * Set inode hint, check file hint returns the right hint
 */
set_inode_write_hint(fd, RWH_WRITE_LIFE_SHORT);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == ihint);
assert(fhint == RWH_WRITE_LIFE_SHORT);

/*
 * Now set file hint, ensure that this is now the hint we get
 */
set_file_write_hint(fd, RWH_WRITE_LIFE_LONG);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == RWH_WRITE_LIFE_LONG);
assert(ihint == RWH_WRITE_LIFE_SHORT);

/*
 * Clear inode write hint, ensure that file still returns the set hint
 */
set_inode_write_hint(fd, RWF_WRITE_LIFE_NOT_SET);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == RWH_WRITE_LIFE_LONG);
assert(ihint == RWF_WRITE_LIFE_NOT_SET);

/*
 * Clear file write hint, ensure that now returns cleared
 */
set_file_write_hint(fd, RWF_WRITE_LIFE_NOT_SET);
fhint = get_file_write_hint(fd);
assert(fhint == RWF_WRITE_LIFE_NOT_SET);

close(fd);
unlink(filename);
return 0;
}


-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Darrick J. Wong
On Mon, Jun 26, 2017 at 07:55:27AM -0600, Jens Axboe wrote:
> On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
> > Please document the userspace API (added linux-api and linux-man
> > to CC for sugestions), especially including the odd effects of the
> > per-inode settings.
> 
> Of course, I'll send in a diff for the fcntl(2) man page.
> 
> > Also I would highly recommend to use different fcntl commands
> > for the file vs inode hints to avoid any strange behavior.
> 
> OK, used to have that too... I can add specific _FILE versions.

While you're at it, can you also send in an xfstest or two to check the
basic functionality of the fcntl so that we know the code reflects the
userspace API ("I set this hint and now I can query it back" and "file
hint overrides inode hint") that we want?

--D

> 
> -- 
> Jens Axboe
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
Define a set of write life time hints:

RWH_WRITE_LIFE_NOT_SET  No hint information set
RWH_WRITE_LIFE_NONE No hints about write life time
RWH_WRITE_LIFE_SHORTData written has a short life time
RWH_WRITE_LIFE_MEDIUM   Data written has a medium life time
RWH_WRITE_LIFE_LONG Data written has a long life time
RWH_WRITE_LIFE_EXTREME  Data written has an extremely long life time

The intent is for these values to be relative to each other, no
absolute meaning should be attached to these flag names.

Add an fcntl interface for querying these flags, and also for
setting them as well:

F_GET_RW_HINT   Returns the read/write hint set on the
underlying inode.

F_SET_RW_HINT   Set one of the above write hints on the
underlying inode.

F_GET_FILE_RW_HINT  Returns the read/write hint set on the
file descriptor.

F_SET_FILE_RW_HINT  Set one of the above write hints on the
file descriptor.

The user passes in a 64-bit pointer to get/set these values, and
the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode flags
and in struct file as well, and pass them to the kiocb flags. If
both a file and its corresponding inode has a write hint, then we
use the one in the file, if available. The file hint can be used
for sync/direct IO, for buffered writeback only the inode hint
is available.

This is in preparation for utilizing these hints in the block layer,
to guide on-media data placement.

/*
 * writehint.c: get or set an inode write hint
 */
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 

 #ifndef F_GET_RW_HINT
 #define F_LINUX_SPECIFIC_BASE  1024
 #define F_GET_RW_HINT  (F_LINUX_SPECIFIC_BASE + 11)
 #define F_SET_RW_HINT  (F_LINUX_SPECIFIC_BASE + 12)
 #endif

static char *str[] = { "RWF_WRITE_LIFE_NOT_SET", "RWH_WRITE_LIFE_NONE",
"RWH_WRITE_LIFE_SHORT", "RWH_WRITE_LIFE_MEDIUM",
"RWH_WRITE_LIFE_LONG", "RWH_WRITE_LIFE_EXTREME" };

int main(int argc, char *argv[])
{
uint64_t hint;
int fd, ret;

if (argc < 2) {
fprintf(stderr, "%s: file \n", argv[0]);
return 1;
}

fd = open(argv[1], O_RDONLY);
if (fd < 0) {
perror("open");
return 2;
}

if (argc > 2) {
hint = atoi(argv[2]);
ret = fcntl(fd, F_SET_RW_HINT, );
if (ret < 0) {
perror("fcntl: F_SET_RW_HINT");
return 4;
}
}

ret = fcntl(fd, F_GET_RW_HINT, );
if (ret < 0) {
perror("fcntl: F_GET_RW_HINT");
return 3;
}

printf("%s: hint %s\n", argv[1], str[hint]);
close(fd);
return 0;
}

Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/fcntl.c | 66 +
 fs/inode.c | 11 +++
 fs/open.c  |  1 +
 include/linux/fs.h | 74 --
 include/uapi/linux/fcntl.h | 21 +
 5 files changed, 171 insertions(+), 2 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..e166807646bf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,66 @@ static int f_getowner_uids(struct file *filp, unsigned 
long arg)
 }
 #endif
 
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+   struct inode *inode = file_inode(file);
+   bool on_file = false;
+   enum rw_hint hint;
+   long ret = 0;
+
+   switch (cmd) {
+   case F_GET_FILE_RW_HINT:
+   on_file = true;
+   case F_GET_RW_HINT:
+   /*
+* If we ask for the file descriptor hint and it isn't set,
+* return the underlying inode write hint. This is what
+* writeback does as well.
+*/
+   hint = RWF_WRITE_LIFE_NOT_SET;
+   if (on_file)
+   hint = file->f_write_hint;
+
+   if (!on_file || hint == RWF_WRITE_LIFE_NOT_SET)
+   hint = mask_to_write_hint(inode->i_flags,
+   S_WRITE_LIFE_SHIFT);
+   if (put_user(hint, (u64 __user *) arg))
+   ret = -EFAULT;
+   break;
+   case F_SET_FILE_RW_HINT:
+   on_file = true;
+   case F_SET_RW_HINT:
+   if (get_user(hint, (u64 __user *) arg)) {
+   ret = -EFAULT;
+   break;
+   }
+   switch (hint) {
+

Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
> Please document the userspace API (added linux-api and linux-man
> to CC for sugestions), especially including the odd effects of the
> per-inode settings.

Of course, I'll send in a diff for the fcntl(2) man page.

> Also I would highly recommend to use different fcntl commands
> for the file vs inode hints to avoid any strange behavior.

OK, used to have that too... I can add specific _FILE versions.

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Christoph Hellwig
Please document the userspace API (added linux-api and linux-man
to CC for sugestions), especially including the odd effects of the
per-inode settings.

Also I would highly recommend to use different fcntl commands
for the file vs inode hints to avoid any strange behavior.


[PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-20 Thread Jens Axboe
Define a set of write life time hints:

RWH_WRITE_LIFE_NONE No hints about write life time
RWH_WRITE_LIFE_SHORTData written has a short life time
RWH_WRITE_LIFE_MEDIUM   Data written has a medium life time
RWH_WRITE_LIFE_LONG Data written has a long life time
RWH_WRITE_LIFE_EXTREME  Data written has an extremely long life tim

The intent is for these values to be relative to each other, no
absolute meaning should be attached to these flag names.

Add an fcntl interface for querying these flags, and also for
setting them as well:

F_GET_RW_HINT   Returns the read/write hint set.

F_SET_RW_HINT   Pass one of the above write hints.

The user passes in a 64-bit pointer to get/set these values, and
the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode flags
and in struct file as well, and pass them to the kiocb flags. If
both a file and its corresponding inode has a write hint, then we
use the one in the file, if available. The file hint can be used
for sync/direct IO, for buffered writeback only the inode hint
is available.

This is in preparation for utilizing these hints in the block layer,
to guide on-media data placement.

/*
 * writehint.c: check or set a file/inode write hint
 */
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 

 #ifndef F_RW_GET_HINT
 #define F_LINUX_SPECIFIC_BASE  1024
 #define F_RW_GET_HINT  (F_LINUX_SPECIFIC_BASE + 11)
 #define F_RW_SET_HINT  (F_LINUX_SPECIFIC_BASE + 12)
 #endif

static char *str[] = { "WRITE_LIFE_NOT_SET", "WRITE_LIFE_NONE",
"WRITE_LIFE_SHORT", "WRITE_LIFE_MEDIUM",
"WRITE_LIFE_LONG", "WRITE_LIFE_EXTREME" };

int main(int argc, char *argv[])
{
uint64_t hint;
int fd, ret;

if (argc < 2) {
fprintf(stderr, "%s: file \n", argv[0]);
return 1;
}

fd = open(argv[1], O_RDONLY);
if (fd < 0) {
perror("open");
return 2;
}

if (argc > 2) {
hint = atoi(argv[2]);
ret = fcntl(fd, F_RW_SET_HINT, );
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return 4;
}
}

ret = fcntl(fd, F_RW_GET_HINT, );
if (ret < 0) {
perror("fcntl: F_RW_GET_HINT");
return 3;
}

printf("%s: hint %s\n", argv[1], str[hint]);
close(fd);
return 0;
}

Signed-off-by: Jens Axboe 
---
 fs/fcntl.c | 60 +
 fs/inode.c | 11 +++
 fs/open.c  |  1 +
 include/linux/fs.h | 74 --
 include/uapi/linux/fcntl.h | 16 ++
 5 files changed, 160 insertions(+), 2 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..7037f0560f36 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,62 @@ static int f_getowner_uids(struct file *filp, unsigned 
long arg)
 }
 #endif
 
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+   struct inode *inode = file_inode(file);
+   enum rw_hint hint, old_hint;
+   long ret = 0;
+
+   switch (cmd) {
+   case F_GET_RW_HINT:
+   if (file->f_write_hint != WRITE_LIFE_NOT_SET)
+   hint = file->f_write_hint;
+   else
+   hint = mask_to_write_hint(inode->i_flags,
+   S_WRITE_LIFE_SHIFT);
+   if (put_user(hint, (u64 __user *) arg))
+   ret = -EFAULT;
+   break;
+   case F_SET_RW_HINT:
+   if (get_user(hint, (u64 __user *) arg)) {
+   ret = -EFAULT;
+   break;
+   }
+   switch (hint) {
+   case WRITE_LIFE_NOT_SET:
+   case WRITE_LIFE_NONE:
+   case WRITE_LIFE_SHORT:
+   case WRITE_LIFE_MEDIUM:
+   case WRITE_LIFE_LONG:
+   case WRITE_LIFE_EXTREME:
+   spin_lock(>f_lock);
+   file->f_write_hint = hint;
+   spin_unlock(>f_lock);
+
+   /*
+* Only propagate hint to inode, if no hint is set,
+* or if the hint is being cleared
+*/
+   old_hint = mask_to_write_hint(inode->i_flags,
+   S_WRITE_LIFE_SHIFT);
+   if (old_hint == WRITE_LIFE_NOT_SET ||
+   hint == WRITE_LIFE_NOT_SET)
+   

Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-20 Thread Jens Axboe
On 06/20/2017 05:09 PM, Bart Van Assche wrote:
> On Mon, 2017-06-19 at 11:04 -0600, Jens Axboe wrote:
>> +static long fcntl_rw_hint(struct file *file, unsigned int cmd,
>> +  u64 __user *ptr)
>> +{
>> +struct inode *inode = file_inode(file);
>> +long ret = 0;
>> +u64 hint;
>> +
>> +switch (cmd) {
>> +case F_GET_RW_HINT:
>> +hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
>> +if (put_user(hint, ptr))
>> +ret = -EFAULT;
>> +break;
>> +case F_SET_RW_HINT:
>> +if (get_user(hint, ptr)) {
>> +ret = -EFAULT;
>> +break;
>> +}
>> +switch (hint) {
>> +case WRITE_LIFE_NONE:
>> +case WRITE_LIFE_SHORT:
>> +case WRITE_LIFE_MEDIUM:
>> +case WRITE_LIFE_LONG:
>> +case WRITE_LIFE_EXTREME:
>> +inode_set_write_hint(inode, hint);
>> +ret = 0;
>> +break;
>> +default:
>> +ret = -EINVAL;
>> +}
>> +break;
>> +default:
>> +ret = -EINVAL;
>> +break;
>> +}
>> +
>> +return ret;
>> +}
> 
> Hello Jens,
> 
> Do we need an (inline) helper function for checking the validity of a
> numerical WRITE_LIFE value next to the definition of the WRITE_LIFE_*
> constants, e.g. WRITE_LIFE_NONE <= hint && hint <= WRITE_LIFE_EXTREME?

Might not hurt in general, I can fold something like that in.

>> +/*
>> + * Steal 3 bits for stream information, this allows 8 valid streams
>> + */
>> +#define IOCB_WRITE_LIFE_SHIFT   7
>> +#define IOCB_WRITE_LIFE_MASK(BIT(7) | BIT(8) | BIT(9))
> 
> A minor comment: how about making this easier to read by defining
> IOCB_WRITE_LIFE_MASK as (7 << IOCB_WRITE_LIFE_SHIFT)?

Agree, that would be prettier.

>>  /*
>> + * Expected life time hint of a write for this inode. This uses the
>> + * WRITE_LIFE_* encoding, we just need to define the shift. We need
>> + * 3 bits for this. Next S_* value is 131072, bit 17.
>> + */
>> +#define S_WRITE_LIFE_MASK   0x1c000 /* bits 14..16 */
>> +#define S_WRITE_LIFE_SHIFT  14  /* 16384, next bit */
> 
> Another minor comment: how about making this easier to read by defining
> S_WRITE_LIFE_MASK as (7 << S_WRITE_LIFE_SHIFT)?

Agree, I'll make that change too.

>> /*
>> + * Write life time hint values.
>> + */
>> +enum rw_hint {
>> +WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
>> +WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
>> +WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
>> +WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
>> +WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME
>> +};
>> [ ... ]
>> +/*
>> + * Valid hint values for F_{GET,SET}_RW_HINT
>> + */
>> +#define RWH_WRITE_LIFE_NONE 0
>> +#define RWH_WRITE_LIFE_SHORT1
>> +#define RWH_WRITE_LIFE_MEDIUM   2
>> +#define RWH_WRITE_LIFE_LONG 3
>> +#define RWH_WRITE_LIFE_EXTREME  4
> 
> Maybe I missed something, but it's not clear to me why we have both an
> enum and defines with the same numerical values? BTW, I prefer an enum
> above #defines.

We use the enum internally, that's the hint that the fs and block layer
sees. The reason for the defines is for the user interface, where we
don't want that to be an enum. So the mapping between the two is the
definition of the enum rw_hint values.

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-20 Thread Bart Van Assche
On Mon, 2017-06-19 at 11:04 -0600, Jens Axboe wrote:
> +static long fcntl_rw_hint(struct file *file, unsigned int cmd,
> +   u64 __user *ptr)
> +{
> + struct inode *inode = file_inode(file);
> + long ret = 0;
> + u64 hint;
> +
> + switch (cmd) {
> + case F_GET_RW_HINT:
> + hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
> + if (put_user(hint, ptr))
> + ret = -EFAULT;
> + break;
> + case F_SET_RW_HINT:
> + if (get_user(hint, ptr)) {
> + ret = -EFAULT;
> + break;
> + }
> + switch (hint) {
> + case WRITE_LIFE_NONE:
> + case WRITE_LIFE_SHORT:
> + case WRITE_LIFE_MEDIUM:
> + case WRITE_LIFE_LONG:
> + case WRITE_LIFE_EXTREME:
> + inode_set_write_hint(inode, hint);
> + ret = 0;
> + break;
> + default:
> + ret = -EINVAL;
> + }
> + break;
> + default:
> + ret = -EINVAL;
> + break;
> + }
> +
> + return ret;
> +}

Hello Jens,

Do we need an (inline) helper function for checking the validity of a
numerical WRITE_LIFE value next to the definition of the WRITE_LIFE_*
constants, e.g. WRITE_LIFE_NONE <= hint && hint <= WRITE_LIFE_EXTREME?

> +/*
> + * Steal 3 bits for stream information, this allows 8 valid streams
> + */
> +#define IOCB_WRITE_LIFE_SHIFT7
> +#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9))

A minor comment: how about making this easier to read by defining
IOCB_WRITE_LIFE_MASK as (7 << IOCB_WRITE_LIFE_SHIFT)?

>  /*
> + * Expected life time hint of a write for this inode. This uses the
> + * WRITE_LIFE_* encoding, we just need to define the shift. We need
> + * 3 bits for this. Next S_* value is 131072, bit 17.
> + */
> +#define S_WRITE_LIFE_MASK0x1c000 /* bits 14..16 */
> +#define S_WRITE_LIFE_SHIFT   14  /* 16384, next bit */

Another minor comment: how about making this easier to read by defining
S_WRITE_LIFE_MASK as (7 << S_WRITE_LIFE_SHIFT)?

> /*
> + * Write life time hint values.
> + */
> +enum rw_hint {
> + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
> + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
> + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
> + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
> + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME
> +};
> [ ... ]
> +/*
> + * Valid hint values for F_{GET,SET}_RW_HINT
> + */
> +#define RWH_WRITE_LIFE_NONE  0
> +#define RWH_WRITE_LIFE_SHORT 1
> +#define RWH_WRITE_LIFE_MEDIUM2
> +#define RWH_WRITE_LIFE_LONG  3
> +#define RWH_WRITE_LIFE_EXTREME   4

Maybe I missed something, but it's not clear to me why we have both an enum and
defines with the same numerical values? BTW, I prefer an enum above #defines.

Thanks,

Bart.

[PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-19 Thread Jens Axboe
Define a set of write life time hints:

and add an fcntl interface for querying these flags, and also for
setting them as well:

F_GET_RW_HINT   Returns the read/write hint set.

F_SET_RW_HINT   Pass one of the above write hints.

The user passes in a 64-bit pointer to get/set these values, and
the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode flags,
and pass them to the kiocb flags as well. This is in preparation
for utilizing these hints in the block layer, to guide on-media
data placement.

/*
 * writehint.c: check or set a file/inode write hint
 */

static char *str[] = { "WRITE_LIFE_NONE", "WRITE_LIFE_SHORT",
"WRITE_LIFE_MEDIUM", "WRITE_LIFE_LONG",
"WRITE_LIFE_EXTREME" };

int main(int argc, char *argv[])
{
uint64_t hint = -1ULL;
int fd, ret;

if (argc < 2) {
fprintf(stderr, "%s: dev \n", argv[0]);
return 1;
}

fd = open(argv[1], O_RDONLY);
if (fd < 0) {
perror("open");
return 2;
}

if (argc > 2)
hint = atoi(argv[2]);

if (hint == -1ULL) {
ret = fcntl(fd, F_RW_GET_HINT, );
if (ret < 0) {
perror("fcntl: F_RW_GET_HINT");
return 3;
}
} else {
ret = fcntl(fd, F_RW_SET_HINT, );
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return 4;
}
}

printf("%s: %shint %s\n", argv[1], hint != -1ULL ? "set " : "", 
str[hint]);
close(fd);
return 0;
}

Signed-off-by: Jens Axboe 
---
 fs/fcntl.c | 43 
 fs/inode.c | 11 +
 include/linux/fs.h | 61 ++
 include/uapi/linux/fcntl.h | 15 
 4 files changed, 130 insertions(+)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..113b78c11631 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,45 @@ static int f_getowner_uids(struct file *filp, unsigned 
long arg)
 }
 #endif
 
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ u64 __user *ptr)
+{
+   struct inode *inode = file_inode(file);
+   long ret = 0;
+   u64 hint;
+
+   switch (cmd) {
+   case F_GET_RW_HINT:
+   hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
+   if (put_user(hint, ptr))
+   ret = -EFAULT;
+   break;
+   case F_SET_RW_HINT:
+   if (get_user(hint, ptr)) {
+   ret = -EFAULT;
+   break;
+   }
+   switch (hint) {
+   case WRITE_LIFE_NONE:
+   case WRITE_LIFE_SHORT:
+   case WRITE_LIFE_MEDIUM:
+   case WRITE_LIFE_LONG:
+   case WRITE_LIFE_EXTREME:
+   inode_set_write_hint(inode, hint);
+   ret = 0;
+   break;
+   default:
+   ret = -EINVAL;
+   }
+   break;
+   default:
+   ret = -EINVAL;
+   break;
+   }
+
+   return ret;
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
 {
@@ -337,6 +376,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned 
long arg,
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
+   case F_GET_RW_HINT:
+   case F_SET_RW_HINT:
+   err = fcntl_rw_hint(filp, cmd, (u64 __user *) arg);
+   break;
default:
break;
}
diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..defb015a2c6d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2120,3 +2120,14 @@ struct timespec current_time(struct inode *inode)
return timespec_trunc(now, inode->i_sb->s_time_gran);
 }
 EXPORT_SYMBOL(current_time);
+
+void inode_set_write_hint(struct inode *inode, enum rw_hint hint)
+{
+   unsigned int flags = write_hint_to_mask(hint, S_WRITE_LIFE_SHIFT);
+
+   if (flags != mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT)) {
+   inode_lock(inode);
+   inode_set_flags(inode, flags, S_WRITE_LIFE_MASK);
+   inode_unlock(inode);
+   }
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 023f0324762b..8720251cc153 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -270,6 +270,12 @@ struct writeback_control;
 #define IOCB_SYNC  (1 << 5)
 #define IOCB_WRITE (1 << 6)
 
+/*
+ * Steal 3 bits for stream