Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-11 Thread Zhi Yong Wu
On Fri, Jan 11, 2013 at 10:27 PM, David Sterba  wrote:
> On Fri, Jan 11, 2013 at 03:38:31PM +0800, Zhi Yong Wu wrote:
>> On Thu, Jan 10, 2013 at 8:51 AM, David Sterba  wrote:
>> > On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
>> >> --- a/fs/hot_tracking.c
>> >> +++ b/fs/hot_tracking.c
>> >> @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info 
>> >> *root)
>> >>   spin_unlock(>lock);
>> >>  }
>> >>
>> >> +struct hot_inode_item
>> >> +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
>> >> +{
>> >> + struct rb_node **p = >hot_inode_tree.map.rb_node;
>> >> + struct rb_node *parent = NULL;
>> >> + struct hot_comm_item *ci;
>> >> + struct hot_inode_item *entry;
>> >> +
>> >> + /* walk tree to find insertion point */
>> >> + spin_lock(>lock);
>> >> + while (*p) {
>> >> + parent = *p;
>> >> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
>> >> + entry = container_of(ci, struct hot_inode_item, hot_inode);
>> >> + if (ino < entry->i_ino)
>> >> + p = &(*p)->rb_left;
>> >> + else if (ino > entry->i_ino)
>> >> + p = &(*p)->rb_right;
>> >
>> > style comment: put { } around the all if/else blocks,
>> no, it will violate checkpatch.pl. If the if/else block only contains
>> one line of code, we should not put {} around them.
>
> Unless its in a if / else if / else sequence, see
> Documentation/CodingStyle chapter 3.1. This is what I've learned long
> time ago and using it intuitively. I don't know to what extent
> checkpatch sticks to that document, the code is menat to be read by
> people, so if there is one style prevailing in the subsystem (and it is
> by looking into random fs/*.c files) it's wise to keep the style
> consistent.
I checked *.c in fs/, its coding style about if/else if is consistent
with what i said.:)
e.g in direct-io.c

if (sdio->final_block_in_bio != sdio->cur_page_block ||
cur_offset != bio_next_offset)
dio_bio_submit(dio, sdio);
/*
 * Submit now if the underlying fs is about to perform a
 * metadata read
 */
else if (sdio->boundary)
dio_bio_submit(dio, sdio);

>
> david



-- 
Regards,

Zhi Yong Wu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-11 Thread David Sterba
On Fri, Jan 11, 2013 at 03:38:31PM +0800, Zhi Yong Wu wrote:
> On Thu, Jan 10, 2013 at 8:51 AM, David Sterba  wrote:
> > On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
> >> --- a/fs/hot_tracking.c
> >> +++ b/fs/hot_tracking.c
> >> @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info 
> >> *root)
> >>   spin_unlock(>lock);
> >>  }
> >>
> >> +struct hot_inode_item
> >> +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
> >> +{
> >> + struct rb_node **p = >hot_inode_tree.map.rb_node;
> >> + struct rb_node *parent = NULL;
> >> + struct hot_comm_item *ci;
> >> + struct hot_inode_item *entry;
> >> +
> >> + /* walk tree to find insertion point */
> >> + spin_lock(>lock);
> >> + while (*p) {
> >> + parent = *p;
> >> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
> >> + entry = container_of(ci, struct hot_inode_item, hot_inode);
> >> + if (ino < entry->i_ino)
> >> + p = &(*p)->rb_left;
> >> + else if (ino > entry->i_ino)
> >> + p = &(*p)->rb_right;
> >
> > style comment: put { } around the all if/else blocks,
> no, it will violate checkpatch.pl. If the if/else block only contains
> one line of code, we should not put {} around them.

Unless its in a if / else if / else sequence, see
Documentation/CodingStyle chapter 3.1. This is what I've learned long
time ago and using it intuitively. I don't know to what extent
checkpatch sticks to that document, the code is menat to be read by
people, so if there is one style prevailing in the subsystem (and it is
by looking into random fs/*.c files) it's wise to keep the style
consistent.

david
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-11 Thread David Sterba
On Fri, Jan 11, 2013 at 03:38:31PM +0800, Zhi Yong Wu wrote:
 On Thu, Jan 10, 2013 at 8:51 AM, David Sterba dste...@suse.cz wrote:
  On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
  --- a/fs/hot_tracking.c
  +++ b/fs/hot_tracking.c
  @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info 
  *root)
spin_unlock(root-lock);
   }
 
  +struct hot_inode_item
  +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
  +{
  + struct rb_node **p = root-hot_inode_tree.map.rb_node;
  + struct rb_node *parent = NULL;
  + struct hot_comm_item *ci;
  + struct hot_inode_item *entry;
  +
  + /* walk tree to find insertion point */
  + spin_lock(root-lock);
  + while (*p) {
  + parent = *p;
  + ci = rb_entry(parent, struct hot_comm_item, rb_node);
  + entry = container_of(ci, struct hot_inode_item, hot_inode);
  + if (ino  entry-i_ino)
  + p = (*p)-rb_left;
  + else if (ino  entry-i_ino)
  + p = (*p)-rb_right;
 
  style comment: put { } around the all if/else blocks,
 no, it will violate checkpatch.pl. If the if/else block only contains
 one line of code, we should not put {} around them.

Unless its in a if / else if / else sequence, see
Documentation/CodingStyle chapter 3.1. This is what I've learned long
time ago and using it intuitively. I don't know to what extent
checkpatch sticks to that document, the code is menat to be read by
people, so if there is one style prevailing in the subsystem (and it is
by looking into random fs/*.c files) it's wise to keep the style
consistent.

david
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-11 Thread Zhi Yong Wu
On Fri, Jan 11, 2013 at 10:27 PM, David Sterba dste...@suse.cz wrote:
 On Fri, Jan 11, 2013 at 03:38:31PM +0800, Zhi Yong Wu wrote:
 On Thu, Jan 10, 2013 at 8:51 AM, David Sterba dste...@suse.cz wrote:
  On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
  --- a/fs/hot_tracking.c
  +++ b/fs/hot_tracking.c
  @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info 
  *root)
spin_unlock(root-lock);
   }
 
  +struct hot_inode_item
  +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
  +{
  + struct rb_node **p = root-hot_inode_tree.map.rb_node;
  + struct rb_node *parent = NULL;
  + struct hot_comm_item *ci;
  + struct hot_inode_item *entry;
  +
  + /* walk tree to find insertion point */
  + spin_lock(root-lock);
  + while (*p) {
  + parent = *p;
  + ci = rb_entry(parent, struct hot_comm_item, rb_node);
  + entry = container_of(ci, struct hot_inode_item, hot_inode);
  + if (ino  entry-i_ino)
  + p = (*p)-rb_left;
  + else if (ino  entry-i_ino)
  + p = (*p)-rb_right;
 
  style comment: put { } around the all if/else blocks,
 no, it will violate checkpatch.pl. If the if/else block only contains
 one line of code, we should not put {} around them.

 Unless its in a if / else if / else sequence, see
 Documentation/CodingStyle chapter 3.1. This is what I've learned long
 time ago and using it intuitively. I don't know to what extent
 checkpatch sticks to that document, the code is menat to be read by
 people, so if there is one style prevailing in the subsystem (and it is
 by looking into random fs/*.c files) it's wise to keep the style
 consistent.
I checked *.c in fs/, its coding style about if/else if is consistent
with what i said.:)
e.g in direct-io.c

if (sdio-final_block_in_bio != sdio-cur_page_block ||
cur_offset != bio_next_offset)
dio_bio_submit(dio, sdio);
/*
 * Submit now if the underlying fs is about to perform a
 * metadata read
 */
else if (sdio-boundary)
dio_bio_submit(dio, sdio);


 david



-- 
Regards,

Zhi Yong Wu
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-10 Thread Zhi Yong Wu
On Thu, Jan 10, 2013 at 8:51 AM, David Sterba  wrote:
> On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
>>   spin_unlock(>lock);
>>  }
>>
>> +struct hot_inode_item
>> +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
>> +{
>> + struct rb_node **p = >hot_inode_tree.map.rb_node;
>> + struct rb_node *parent = NULL;
>> + struct hot_comm_item *ci;
>> + struct hot_inode_item *entry;
>> +
>> + /* walk tree to find insertion point */
>> + spin_lock(>lock);
>> + while (*p) {
>> + parent = *p;
>> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
>> + entry = container_of(ci, struct hot_inode_item, hot_inode);
>> + if (ino < entry->i_ino)
>> + p = &(*p)->rb_left;
>> + else if (ino > entry->i_ino)
>> + p = &(*p)->rb_right;
>
> style comment: put { } around the all if/else blocks,
no, it will violate checkpatch.pl. If the if/else block only contains
one line of code, we should not put {} around them.
>
>> + else {
>> + spin_unlock(>lock);
>> + kref_get(>hot_inode.refs);
>
> jumping forwards in the series, the spin_unlock and kref_get get swapped
> later, and I think that's the right order. Otherwise there's a small
> window where the entry does not get the reference and could be
> potentially freed by racing kref_put, no?
yes, good catch, thanks, done
>
> 
> spin_unlock(tree)
>  spin_lock(tree)
>  
>  kref_put(E) or via hot_inode_item_put(E) (1)
> kref_get(E)   (2)
>
>
> if the reference count at (1) was 1, it's freed and (2) hits a free
> memory. hot_inode_item_put can be called from filesystem or via seq
> print of the respective /proc files, so I think there are chances to hit
> the problem.
Great.
>
>> + return entry;
>> + }
>> + }
>> + spin_unlock(>lock);
>> +
>> + entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
>> + if (!entry)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + spin_lock(>lock);
>> + hot_inode_item_init(entry, ino, >hot_inode_tree);
>> + rb_link_node(>hot_inode.rb_node, parent, p);
>> + rb_insert_color(>hot_inode.rb_node,
>> + >hot_inode_tree.map);
>> + spin_unlock(>lock);
>> +
>> + kref_get(>hot_inode.refs);
>
> Similar here, the entry is inserted into the tree but there's no
> refcount yet. And the order of spin_unlock/kref_get remains unchanged.
ditto
>
>> + return entry;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
>> +
>> +static struct hot_range_item
>> +*hot_range_item_lookup(struct hot_inode_item *he,
>> + loff_t start)
>> +{
>> + struct rb_node **p = >hot_range_tree.map.rb_node;
>> + struct rb_node *parent = NULL;
>> + struct hot_comm_item *ci;
>> + struct hot_range_item *entry;
>> +
>> + /* walk tree to find insertion point */
>> + spin_lock(>lock);
>> + while (*p) {
>> + parent = *p;
>> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
>> + entry = container_of(ci, struct hot_range_item, hot_range);
>> + if (start < entry->start)
>> + p = &(*p)->rb_left;
>> + else if (start > hot_range_end(entry))
>> + p = &(*p)->rb_right;
>
> if { ...}
> else if { ... }
We should not put {} around them as what i explained above.
>
>> + else {
>> + spin_unlock(>lock);
>> + kref_get(>hot_range.refs);
>
> same here
Done
>
>> + return entry;
>> + }
>> + }
>> + spin_unlock(>lock);
>> +
>> + entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
>> + if (!entry)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + spin_lock(>lock);
>> + hot_range_item_init(entry, start, he);
>> + rb_link_node(>hot_range.rb_node, parent, p);
>> + rb_insert_color(>hot_range.rb_node,
>> + >hot_range_tree.map);
>> + spin_unlock(>lock);
>> +
>> + kref_get(>hot_range.refs);
>
> and here
Done
>
>> + return entry;
>> +}
>> +
>> +/*
>> + * This function does the actual work of updating
>> + * the frequency numbers, whatever they turn out to be.
>
> Can this function be described a bit better? This comment did not help.
OK, i will
>
>> + */
>> +static void hot_rw_freq_calc(struct timespec old_atime,
>> + struct timespec cur_time, u64 *avg)
>> +{
>> + struct timespec delta_ts;
>> + u64 new_delta;
>> +
>> + delta_ts = timespec_sub(cur_time, old_atime);
>> + new_delta = timespec_to_ns(_ts) >> FREQ_POWER;
>> +
>> + *avg = (*avg << 

Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-10 Thread Zhi Yong Wu
On Thu, Jan 10, 2013 at 8:51 AM, David Sterba dste...@suse.cz wrote:
 On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
 --- a/fs/hot_tracking.c
 +++ b/fs/hot_tracking.c
 @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
   spin_unlock(root-lock);
  }

 +struct hot_inode_item
 +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
 +{
 + struct rb_node **p = root-hot_inode_tree.map.rb_node;
 + struct rb_node *parent = NULL;
 + struct hot_comm_item *ci;
 + struct hot_inode_item *entry;
 +
 + /* walk tree to find insertion point */
 + spin_lock(root-lock);
 + while (*p) {
 + parent = *p;
 + ci = rb_entry(parent, struct hot_comm_item, rb_node);
 + entry = container_of(ci, struct hot_inode_item, hot_inode);
 + if (ino  entry-i_ino)
 + p = (*p)-rb_left;
 + else if (ino  entry-i_ino)
 + p = (*p)-rb_right;

 style comment: put { } around the all if/else blocks,
no, it will violate checkpatch.pl. If the if/else block only contains
one line of code, we should not put {} around them.

 + else {
 + spin_unlock(root-lock);
 + kref_get(entry-hot_inode.refs);

 jumping forwards in the series, the spin_unlock and kref_get get swapped
 later, and I think that's the right order. Otherwise there's a small
 window where the entry does not get the reference and could be
 potentially freed by racing kref_put, no?
yes, good catch, thanks, done

 lookup entry E
 spin_unlock(tree)
  spin_lock(tree)
  lookup entry E
  kref_put(E) or via hot_inode_item_put(E) (1)
 kref_get(E)   (2)


 if the reference count at (1) was 1, it's freed and (2) hits a free
 memory. hot_inode_item_put can be called from filesystem or via seq
 print of the respective /proc files, so I think there are chances to hit
 the problem.
Great.

 + return entry;
 + }
 + }
 + spin_unlock(root-lock);
 +
 + entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
 + if (!entry)
 + return ERR_PTR(-ENOMEM);
 +
 + spin_lock(root-lock);
 + hot_inode_item_init(entry, ino, root-hot_inode_tree);
 + rb_link_node(entry-hot_inode.rb_node, parent, p);
 + rb_insert_color(entry-hot_inode.rb_node,
 + root-hot_inode_tree.map);
 + spin_unlock(root-lock);
 +
 + kref_get(entry-hot_inode.refs);

 Similar here, the entry is inserted into the tree but there's no
 refcount yet. And the order of spin_unlock/kref_get remains unchanged.
ditto

 + return entry;
 +}
 +EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
 +
 +static struct hot_range_item
 +*hot_range_item_lookup(struct hot_inode_item *he,
 + loff_t start)
 +{
 + struct rb_node **p = he-hot_range_tree.map.rb_node;
 + struct rb_node *parent = NULL;
 + struct hot_comm_item *ci;
 + struct hot_range_item *entry;
 +
 + /* walk tree to find insertion point */
 + spin_lock(he-lock);
 + while (*p) {
 + parent = *p;
 + ci = rb_entry(parent, struct hot_comm_item, rb_node);
 + entry = container_of(ci, struct hot_range_item, hot_range);
 + if (start  entry-start)
 + p = (*p)-rb_left;
 + else if (start  hot_range_end(entry))
 + p = (*p)-rb_right;

 if { ...}
 else if { ... }
We should not put {} around them as what i explained above.

 + else {
 + spin_unlock(he-lock);
 + kref_get(entry-hot_range.refs);

 same here
Done

 + return entry;
 + }
 + }
 + spin_unlock(he-lock);
 +
 + entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
 + if (!entry)
 + return ERR_PTR(-ENOMEM);
 +
 + spin_lock(he-lock);
 + hot_range_item_init(entry, start, he);
 + rb_link_node(entry-hot_range.rb_node, parent, p);
 + rb_insert_color(entry-hot_range.rb_node,
 + he-hot_range_tree.map);
 + spin_unlock(he-lock);
 +
 + kref_get(entry-hot_range.refs);

 and here
Done

 + return entry;
 +}
 +
 +/*
 + * This function does the actual work of updating
 + * the frequency numbers, whatever they turn out to be.

 Can this function be described a bit better? This comment did not help.
OK, i will

 + */
 +static void hot_rw_freq_calc(struct timespec old_atime,
 + struct timespec cur_time, u64 *avg)
 +{
 + struct timespec delta_ts;
 + u64 new_delta;
 +
 + delta_ts = timespec_sub(cur_time, old_atime);
 + new_delta = timespec_to_ns(delta_ts)  FREQ_POWER;
 +
 + *avg = (*avg  FREQ_POWER) - *avg + new_delta;
 + *avg = *avg  FREQ_POWER;
 +}
 +
 +static void hot_freq_data_update(struct hot_freq_data *freq_data, 

Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-09 Thread David Sterba
On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
>   spin_unlock(>lock);
>  }
>  
> +struct hot_inode_item
> +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
> +{
> + struct rb_node **p = >hot_inode_tree.map.rb_node;
> + struct rb_node *parent = NULL;
> + struct hot_comm_item *ci;
> + struct hot_inode_item *entry;
> +
> + /* walk tree to find insertion point */
> + spin_lock(>lock);
> + while (*p) {
> + parent = *p;
> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
> + entry = container_of(ci, struct hot_inode_item, hot_inode);
> + if (ino < entry->i_ino)
> + p = &(*p)->rb_left;
> + else if (ino > entry->i_ino)
> + p = &(*p)->rb_right;

style comment: put { } around the all if/else blocks

> + else {
> + spin_unlock(>lock);
> + kref_get(>hot_inode.refs);

jumping forwards in the series, the spin_unlock and kref_get get swapped
later, and I think that's the right order. Otherwise there's a small
window where the entry does not get the reference and could be
potentially freed by racing kref_put, no?


spin_unlock(tree)
 spin_lock(tree)
 
 kref_put(E) or via hot_inode_item_put(E) (1)
kref_get(E)   (2)


if the reference count at (1) was 1, it's freed and (2) hits a free
memory. hot_inode_item_put can be called from filesystem or via seq
print of the respective /proc files, so I think there are chances to hit
the problem.

> + return entry;
> + }
> + }
> + spin_unlock(>lock);
> +
> + entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
> + if (!entry)
> + return ERR_PTR(-ENOMEM);
> +
> + spin_lock(>lock);
> + hot_inode_item_init(entry, ino, >hot_inode_tree);
> + rb_link_node(>hot_inode.rb_node, parent, p);
> + rb_insert_color(>hot_inode.rb_node,
> + >hot_inode_tree.map);
> + spin_unlock(>lock);
> +
> + kref_get(>hot_inode.refs);

Similar here, the entry is inserted into the tree but there's no
refcount yet. And the order of spin_unlock/kref_get remains unchanged.

> + return entry;
> +}
> +EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
> +
> +static struct hot_range_item
> +*hot_range_item_lookup(struct hot_inode_item *he,
> + loff_t start)
> +{
> + struct rb_node **p = >hot_range_tree.map.rb_node;
> + struct rb_node *parent = NULL;
> + struct hot_comm_item *ci;
> + struct hot_range_item *entry;
> +
> + /* walk tree to find insertion point */
> + spin_lock(>lock);
> + while (*p) {
> + parent = *p;
> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
> + entry = container_of(ci, struct hot_range_item, hot_range);
> + if (start < entry->start)
> + p = &(*p)->rb_left;
> + else if (start > hot_range_end(entry))
> + p = &(*p)->rb_right;

if { ...} 
else if { ... }

> + else {
> + spin_unlock(>lock);
> + kref_get(>hot_range.refs);

same here

> + return entry;
> + }
> + }
> + spin_unlock(>lock);
> +
> + entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
> + if (!entry)
> + return ERR_PTR(-ENOMEM);
> +
> + spin_lock(>lock);
> + hot_range_item_init(entry, start, he);
> + rb_link_node(>hot_range.rb_node, parent, p);
> + rb_insert_color(>hot_range.rb_node,
> + >hot_range_tree.map);
> + spin_unlock(>lock);
> +
> + kref_get(>hot_range.refs);

and here

> + return entry;
> +}
> +
> +/*
> + * This function does the actual work of updating
> + * the frequency numbers, whatever they turn out to be.

Can this function be described a bit better? This comment did not help.

> + */
> +static void hot_rw_freq_calc(struct timespec old_atime,
> + struct timespec cur_time, u64 *avg)
> +{
> + struct timespec delta_ts;
> + u64 new_delta;
> +
> + delta_ts = timespec_sub(cur_time, old_atime);
> + new_delta = timespec_to_ns(_ts) >> FREQ_POWER;
> +
> + *avg = (*avg << FREQ_POWER) - *avg + new_delta;
> + *avg = *avg >> FREQ_POWER;
> +}
> +
> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> +{
> + struct timespec cur_time = current_kernel_time();
> +
> + if (write) {
> + freq_data->nr_writes += 1;

The preferred style is

freq_data->nr_writes++

> + hot_rw_freq_calc(freq_data->last_write_time,
> + cur_time,
> +

Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2013-01-09 Thread David Sterba
On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.ker...@gmail.com wrote:
 --- a/fs/hot_tracking.c
 +++ b/fs/hot_tracking.c
 @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
   spin_unlock(root-lock);
  }
  
 +struct hot_inode_item
 +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
 +{
 + struct rb_node **p = root-hot_inode_tree.map.rb_node;
 + struct rb_node *parent = NULL;
 + struct hot_comm_item *ci;
 + struct hot_inode_item *entry;
 +
 + /* walk tree to find insertion point */
 + spin_lock(root-lock);
 + while (*p) {
 + parent = *p;
 + ci = rb_entry(parent, struct hot_comm_item, rb_node);
 + entry = container_of(ci, struct hot_inode_item, hot_inode);
 + if (ino  entry-i_ino)
 + p = (*p)-rb_left;
 + else if (ino  entry-i_ino)
 + p = (*p)-rb_right;

style comment: put { } around the all if/else blocks

 + else {
 + spin_unlock(root-lock);
 + kref_get(entry-hot_inode.refs);

jumping forwards in the series, the spin_unlock and kref_get get swapped
later, and I think that's the right order. Otherwise there's a small
window where the entry does not get the reference and could be
potentially freed by racing kref_put, no?

lookup entry E
spin_unlock(tree)
 spin_lock(tree)
 lookup entry E
 kref_put(E) or via hot_inode_item_put(E) (1)
kref_get(E)   (2)


if the reference count at (1) was 1, it's freed and (2) hits a free
memory. hot_inode_item_put can be called from filesystem or via seq
print of the respective /proc files, so I think there are chances to hit
the problem.

 + return entry;
 + }
 + }
 + spin_unlock(root-lock);
 +
 + entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
 + if (!entry)
 + return ERR_PTR(-ENOMEM);
 +
 + spin_lock(root-lock);
 + hot_inode_item_init(entry, ino, root-hot_inode_tree);
 + rb_link_node(entry-hot_inode.rb_node, parent, p);
 + rb_insert_color(entry-hot_inode.rb_node,
 + root-hot_inode_tree.map);
 + spin_unlock(root-lock);
 +
 + kref_get(entry-hot_inode.refs);

Similar here, the entry is inserted into the tree but there's no
refcount yet. And the order of spin_unlock/kref_get remains unchanged.

 + return entry;
 +}
 +EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
 +
 +static struct hot_range_item
 +*hot_range_item_lookup(struct hot_inode_item *he,
 + loff_t start)
 +{
 + struct rb_node **p = he-hot_range_tree.map.rb_node;
 + struct rb_node *parent = NULL;
 + struct hot_comm_item *ci;
 + struct hot_range_item *entry;
 +
 + /* walk tree to find insertion point */
 + spin_lock(he-lock);
 + while (*p) {
 + parent = *p;
 + ci = rb_entry(parent, struct hot_comm_item, rb_node);
 + entry = container_of(ci, struct hot_range_item, hot_range);
 + if (start  entry-start)
 + p = (*p)-rb_left;
 + else if (start  hot_range_end(entry))
 + p = (*p)-rb_right;

if { ...} 
else if { ... }

 + else {
 + spin_unlock(he-lock);
 + kref_get(entry-hot_range.refs);

same here

 + return entry;
 + }
 + }
 + spin_unlock(he-lock);
 +
 + entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
 + if (!entry)
 + return ERR_PTR(-ENOMEM);
 +
 + spin_lock(he-lock);
 + hot_range_item_init(entry, start, he);
 + rb_link_node(entry-hot_range.rb_node, parent, p);
 + rb_insert_color(entry-hot_range.rb_node,
 + he-hot_range_tree.map);
 + spin_unlock(he-lock);
 +
 + kref_get(entry-hot_range.refs);

and here

 + return entry;
 +}
 +
 +/*
 + * This function does the actual work of updating
 + * the frequency numbers, whatever they turn out to be.

Can this function be described a bit better? This comment did not help.

 + */
 +static void hot_rw_freq_calc(struct timespec old_atime,
 + struct timespec cur_time, u64 *avg)
 +{
 + struct timespec delta_ts;
 + u64 new_delta;
 +
 + delta_ts = timespec_sub(cur_time, old_atime);
 + new_delta = timespec_to_ns(delta_ts)  FREQ_POWER;
 +
 + *avg = (*avg  FREQ_POWER) - *avg + new_delta;
 + *avg = *avg  FREQ_POWER;
 +}
 +
 +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
 +{
 + struct timespec cur_time = current_kernel_time();
 +
 + if (write) {
 + freq_data-nr_writes += 1;

The preferred style is

freq_data-nr_writes++

 + hot_rw_freq_calc(freq_data-last_write_time,
 + cur_time,
 + 

[PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2012-12-20 Thread zwu . kernel
From: Zhi Yong Wu 

  Add some util helpers to update access frequencies
for one file or its range.

Signed-off-by: Zhi Yong Wu 
---
 fs/hot_tracking.c|  178 ++
 fs/hot_tracking.h|5 +
 include/linux/hot_tracking.h |4 +
 3 files changed, 187 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index a73477c..6f587fa 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
spin_unlock(>lock);
 }
 
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino)
+{
+   struct rb_node **p = >hot_inode_tree.map.rb_node;
+   struct rb_node *parent = NULL;
+   struct hot_comm_item *ci;
+   struct hot_inode_item *entry;
+
+   /* walk tree to find insertion point */
+   spin_lock(>lock);
+   while (*p) {
+   parent = *p;
+   ci = rb_entry(parent, struct hot_comm_item, rb_node);
+   entry = container_of(ci, struct hot_inode_item, hot_inode);
+   if (ino < entry->i_ino)
+   p = &(*p)->rb_left;
+   else if (ino > entry->i_ino)
+   p = &(*p)->rb_right;
+   else {
+   spin_unlock(>lock);
+   kref_get(>hot_inode.refs);
+   return entry;
+   }
+   }
+   spin_unlock(>lock);
+
+   entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+   if (!entry)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(>lock);
+   hot_inode_item_init(entry, ino, >hot_inode_tree);
+   rb_link_node(>hot_inode.rb_node, parent, p);
+   rb_insert_color(>hot_inode.rb_node,
+   >hot_inode_tree.map);
+   spin_unlock(>lock);
+
+   kref_get(>hot_inode.refs);
+   return entry;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
+
+static loff_t hot_range_end(struct hot_range_item *hr)
+{
+   if (hr->start + hr->len < hr->start)
+   return (loff_t)-1;
+
+   return hr->start + hr->len - 1;
+}
+
+static struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he,
+   loff_t start)
+{
+   struct rb_node **p = >hot_range_tree.map.rb_node;
+   struct rb_node *parent = NULL;
+   struct hot_comm_item *ci;
+   struct hot_range_item *entry;
+
+   /* walk tree to find insertion point */
+   spin_lock(>lock);
+   while (*p) {
+   parent = *p;
+   ci = rb_entry(parent, struct hot_comm_item, rb_node);
+   entry = container_of(ci, struct hot_range_item, hot_range);
+   if (start < entry->start)
+   p = &(*p)->rb_left;
+   else if (start > hot_range_end(entry))
+   p = &(*p)->rb_right;
+   else {
+   spin_unlock(>lock);
+   kref_get(>hot_range.refs);
+   return entry;
+   }
+   }
+   spin_unlock(>lock);
+
+   entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+   if (!entry)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(>lock);
+   hot_range_item_init(entry, start, he);
+   rb_link_node(>hot_range.rb_node, parent, p);
+   rb_insert_color(>hot_range.rb_node,
+   >hot_range_tree.map);
+   spin_unlock(>lock);
+
+   kref_get(>hot_range.refs);
+   return entry;
+}
+
+/*
+ * This function does the actual work of updating
+ * the frequency numbers, whatever they turn out to be.
+ */
+static void hot_rw_freq_calc(struct timespec old_atime,
+   struct timespec cur_time, u64 *avg)
+{
+   struct timespec delta_ts;
+   u64 new_delta;
+
+   delta_ts = timespec_sub(cur_time, old_atime);
+   new_delta = timespec_to_ns(_ts) >> FREQ_POWER;
+
+   *avg = (*avg << FREQ_POWER) - *avg + new_delta;
+   *avg = *avg >> FREQ_POWER;
+}
+
+static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
+{
+   struct timespec cur_time = current_kernel_time();
+
+   if (write) {
+   freq_data->nr_writes += 1;
+   hot_rw_freq_calc(freq_data->last_write_time,
+   cur_time,
+   _data->avg_delta_writes);
+   freq_data->last_write_time = cur_time;
+   } else {
+   freq_data->nr_reads += 1;
+   hot_rw_freq_calc(freq_data->last_read_time,
+   freq_data->last_read_time,
+   cur_time,
+   _data->avg_delta_reads);
+   freq_data->last_read_time = cur_time;
+   }
+}
+
 /*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
@@ -191,6 +320,55 @@ err:
 EXPORT_SYMBOL_GPL(hot_cache_init);
 
 /*
+ * Main function 

[PATCH RESEND v1 03/16] vfs: add I/O frequency update function

2012-12-20 Thread zwu . kernel
From: Zhi Yong Wu wu...@linux.vnet.ibm.com

  Add some util helpers to update access frequencies
for one file or its range.

Signed-off-by: Zhi Yong Wu wu...@linux.vnet.ibm.com
---
 fs/hot_tracking.c|  178 ++
 fs/hot_tracking.h|5 +
 include/linux/hot_tracking.h |4 +
 3 files changed, 187 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index a73477c..6f587fa 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
spin_unlock(root-lock);
 }
 
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino)
+{
+   struct rb_node **p = root-hot_inode_tree.map.rb_node;
+   struct rb_node *parent = NULL;
+   struct hot_comm_item *ci;
+   struct hot_inode_item *entry;
+
+   /* walk tree to find insertion point */
+   spin_lock(root-lock);
+   while (*p) {
+   parent = *p;
+   ci = rb_entry(parent, struct hot_comm_item, rb_node);
+   entry = container_of(ci, struct hot_inode_item, hot_inode);
+   if (ino  entry-i_ino)
+   p = (*p)-rb_left;
+   else if (ino  entry-i_ino)
+   p = (*p)-rb_right;
+   else {
+   spin_unlock(root-lock);
+   kref_get(entry-hot_inode.refs);
+   return entry;
+   }
+   }
+   spin_unlock(root-lock);
+
+   entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+   if (!entry)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(root-lock);
+   hot_inode_item_init(entry, ino, root-hot_inode_tree);
+   rb_link_node(entry-hot_inode.rb_node, parent, p);
+   rb_insert_color(entry-hot_inode.rb_node,
+   root-hot_inode_tree.map);
+   spin_unlock(root-lock);
+
+   kref_get(entry-hot_inode.refs);
+   return entry;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
+
+static loff_t hot_range_end(struct hot_range_item *hr)
+{
+   if (hr-start + hr-len  hr-start)
+   return (loff_t)-1;
+
+   return hr-start + hr-len - 1;
+}
+
+static struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he,
+   loff_t start)
+{
+   struct rb_node **p = he-hot_range_tree.map.rb_node;
+   struct rb_node *parent = NULL;
+   struct hot_comm_item *ci;
+   struct hot_range_item *entry;
+
+   /* walk tree to find insertion point */
+   spin_lock(he-lock);
+   while (*p) {
+   parent = *p;
+   ci = rb_entry(parent, struct hot_comm_item, rb_node);
+   entry = container_of(ci, struct hot_range_item, hot_range);
+   if (start  entry-start)
+   p = (*p)-rb_left;
+   else if (start  hot_range_end(entry))
+   p = (*p)-rb_right;
+   else {
+   spin_unlock(he-lock);
+   kref_get(entry-hot_range.refs);
+   return entry;
+   }
+   }
+   spin_unlock(he-lock);
+
+   entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+   if (!entry)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(he-lock);
+   hot_range_item_init(entry, start, he);
+   rb_link_node(entry-hot_range.rb_node, parent, p);
+   rb_insert_color(entry-hot_range.rb_node,
+   he-hot_range_tree.map);
+   spin_unlock(he-lock);
+
+   kref_get(entry-hot_range.refs);
+   return entry;
+}
+
+/*
+ * This function does the actual work of updating
+ * the frequency numbers, whatever they turn out to be.
+ */
+static void hot_rw_freq_calc(struct timespec old_atime,
+   struct timespec cur_time, u64 *avg)
+{
+   struct timespec delta_ts;
+   u64 new_delta;
+
+   delta_ts = timespec_sub(cur_time, old_atime);
+   new_delta = timespec_to_ns(delta_ts)  FREQ_POWER;
+
+   *avg = (*avg  FREQ_POWER) - *avg + new_delta;
+   *avg = *avg  FREQ_POWER;
+}
+
+static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
+{
+   struct timespec cur_time = current_kernel_time();
+
+   if (write) {
+   freq_data-nr_writes += 1;
+   hot_rw_freq_calc(freq_data-last_write_time,
+   cur_time,
+   freq_data-avg_delta_writes);
+   freq_data-last_write_time = cur_time;
+   } else {
+   freq_data-nr_reads += 1;
+   hot_rw_freq_calc(freq_data-last_read_time,
+   freq_data-last_read_time,
+   cur_time,
+   freq_data-avg_delta_reads);
+   freq_data-last_read_time = cur_time;
+   }
+}
+
 /*
  * Initialize kmem cache for