Re: [PATCH] fs/buffer.c: Delete redundant uptodate check for buffer

2021-04-11 Thread Shaokun Zhang
+Cc: Andrew Morton

On 2021/4/1 14:57, Shaokun Zhang wrote:
> From: Yang Guo 
> 
> The buffer uptodate state is already checked inside set_buffer_uptodate(),
> so there is no need to call buffer_uptodate() before calling
> set_buffer_uptodate(). Delete the redundant check.
> 
> Cc: Alexander Viro  
> Signed-off-by: Yang Guo 
> Signed-off-by: Shaokun Zhang 
> ---
>  fs/buffer.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 673cfbef9eec..2c0d0b3f3203 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2055,8 +2055,7 @@ int __block_write_begin_int(struct page *page, loff_t 
> pos, unsigned len,
>   block_end = block_start + blocksize;
>   if (block_end <= from || block_start >= to) {
>   if (PageUptodate(page)) {
> - if (!buffer_uptodate(bh))
> - set_buffer_uptodate(bh);
> + set_buffer_uptodate(bh);
>   }
>   continue;
>   }
> @@ -2088,8 +2087,7 @@ int __block_write_begin_int(struct page *page, loff_t 
> pos, unsigned len,
>   }
>   }
>   if (PageUptodate(page)) {
> - if (!buffer_uptodate(bh))
> - set_buffer_uptodate(bh);
> + set_buffer_uptodate(bh);
>   continue; 
>   }
>   if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
> 


[PATCH] fs: Optimized file struct to improve performance

2021-04-08 Thread Shaokun Zhang
From: Yuqi Jin 

In the syscall process, @f_count and @f_mode are frequently used. If
we put them next to each other, they will share the same cacheline,
which is good for performance.

syscall of unixbench is tested on Intel 8180.
before this patch
80 CPUs in system; running 80 parallel copies of tests

System Call Overhead                        3789860.2 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index              BASELINE       RESULT    INDEX
System Call Overhead                          15000.0    3789860.2   2526.6
   
System Benchmarks Index Score (Partial Only) 2526.6

after this patch
80 CPUs in system; running 80 parallel copies of tests

System Call Overhead                        3951328.1 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index              BASELINE       RESULT    INDEX
System Call Overhead                          15000.0    3951328.1   2634.2
   
System Benchmarks Index Score (Partial Only) 2634.2

Cc: Alexander Viro 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3fbb98126248..cfc91d2dd6a7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -917,7 +917,6 @@ struct file {
struct llist_node   fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
-   struct path f_path;
struct inode*f_inode;   /* cached value */
const struct file_operations*f_op;
 
@@ -926,13 +925,14 @@ struct file {
 * Must not be taken from IRQ context.
 */
spinlock_t  f_lock;
-   enum rw_hintf_write_hint;
atomic_long_t   f_count;
unsigned intf_flags;
fmode_t f_mode;
struct mutexf_pos_lock;
loff_t  f_pos;
struct fown_struct  f_owner;
+   enum rw_hintf_write_hint;
+   struct path f_path;
const struct cred   *f_cred;
struct file_ra_statef_ra;
 
-- 
2.7.4



[PATCH] ext4: Delete redundant uptodate check for buffer

2021-04-01 Thread Shaokun Zhang
From: Yang Guo 

The buffer uptodate state is already checked inside set_buffer_uptodate(),
so there is no need to call buffer_uptodate() before calling
set_buffer_uptodate(). Delete the redundant check.

Cc: "Theodore Ts'o" 
Cc: Andreas Dilger 
Signed-off-by: Yang Guo 
Signed-off-by: Shaokun Zhang 
---
 fs/ext4/inode.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0948a43f1b3d..32fa3ad38797 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1066,8 +1066,7 @@ static int ext4_block_write_begin(struct page *page, 
loff_t pos, unsigned len,
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
-   if (!buffer_uptodate(bh))
-   set_buffer_uptodate(bh);
+   set_buffer_uptodate(bh);
}
continue;
}
@@ -1092,8 +1091,7 @@ static int ext4_block_write_begin(struct page *page, 
loff_t pos, unsigned len,
}
}
if (PageUptodate(page)) {
-   if (!buffer_uptodate(bh))
-   set_buffer_uptodate(bh);
+   set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
-- 
2.7.4



[PATCH] fs/buffer.c: Delete redundant uptodate check for buffer

2021-04-01 Thread Shaokun Zhang
From: Yang Guo 

The buffer uptodate state is already checked inside set_buffer_uptodate(),
so there is no need to call buffer_uptodate() before calling
set_buffer_uptodate(). Delete the redundant check.

Cc: Alexander Viro  
Signed-off-by: Yang Guo 
Signed-off-by: Shaokun Zhang 
---
 fs/buffer.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 673cfbef9eec..2c0d0b3f3203 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2055,8 +2055,7 @@ int __block_write_begin_int(struct page *page, loff_t 
pos, unsigned len,
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
-   if (!buffer_uptodate(bh))
-   set_buffer_uptodate(bh);
+   set_buffer_uptodate(bh);
}
continue;
}
@@ -2088,8 +2087,7 @@ int __block_write_begin_int(struct page *page, loff_t 
pos, unsigned len,
}
}
if (PageUptodate(page)) {
-   if (!buffer_uptodate(bh))
-   set_buffer_uptodate(bh);
+   set_buffer_uptodate(bh);
continue; 
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
-- 
2.7.4



Re: linux-next: build warnings after merge of the arm-perf tree

2021-03-26 Thread Shaokun Zhang
Hi Will & Stephen,

Apologies for the mistake.

Will, shall I send a new version (v5) to fix this issue, or would you prefer it handled another way?

Thanks,
Shaokun

On 2021/3/26 16:52, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the arm-perf tree, today's linux-next build (htmldocs)
> produced these warnings:
> 
> Documentation/admin-guide/perf/hisi-pmu.rst:61: WARNING: Unexpected 
> indentation.
> Documentation/admin-guide/perf/hisi-pmu.rst:62: WARNING: Block quote ends 
> without a blank line; unexpected unindent.
> Documentation/admin-guide/perf/hisi-pmu.rst:69: WARNING: Unexpected 
> indentation.
> Documentation/admin-guide/perf/hisi-pmu.rst:70: WARNING: Block quote ends 
> without a blank line; unexpected unindent.
> Documentation/admin-guide/perf/hisi-pmu.rst:83: WARNING: Unexpected 
> indentation.
> 
> Introduced by commit
> 
>   9b86b1b41e0f ("docs: perf: Add new description on HiSilicon uncore PMU v2")
> 


[tip: locking/core] locking/mutex: Remove repeated declaration

2021-03-25 Thread tip-bot2 for Shaokun Zhang
The following commit has been merged into the locking/core branch of tip:

Commit-ID: 8af856d18bfbe89676ade38caa2a5d06f75f211d
Gitweb:
https://git.kernel.org/tip/8af856d18bfbe89676ade38caa2a5d06f75f211d
Author:Shaokun Zhang 
AuthorDate:Wed, 24 Mar 2021 13:40:40 +08:00
Committer: Ingo Molnar 
CommitterDate: Thu, 25 Mar 2021 12:02:06 +01:00

locking/mutex: Remove repeated declaration

Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and 
move
the pre-declarations to the top.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Ingo Molnar 
Acked-by: Waiman Long 
Link: 
https://lore.kernel.org/r/1616564440-61318-1-git-send-email-zhangshao...@hisilicon.com
---
 include/linux/mutex.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a..e7a1267 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 
+struct ww_class;
 struct ww_acquire_ctx;
 
 /*
@@ -65,9 +66,6 @@ struct mutex {
 #endif
 };
 
-struct ww_class;
-struct ww_acquire_ctx;
-
 struct ww_mutex {
struct mutex base;
struct ww_acquire_ctx *ctx;


[tip: locking/core] locking/mutex: Remove repeated declaration

2021-03-25 Thread tip-bot2 for Shaokun Zhang
The following commit has been merged into the locking/core branch of tip:

Commit-ID: 93b02d29fbdbc221c84adcaf0e85be9f8008
Gitweb:
https://git.kernel.org/tip/93b02d29fbdbc221c84adcaf0e85be9f8008
Author:Shaokun Zhang 
AuthorDate:Wed, 24 Mar 2021 13:40:40 +08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 25 Mar 2021 09:42:48 +01:00

locking/mutex: Remove repeated declaration

Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and 
move
the pre-declarations to the top.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Ingo Molnar 
Acked-by: Waiman Long 
Link: 
https://lore.kernel.org/r/1616564440-61318-1-git-send-email-zhangshao...@hisilicon.com
---
 include/linux/mutex.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a..e7a1267 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 
+struct ww_class;
 struct ww_acquire_ctx;
 
 /*
@@ -65,9 +66,6 @@ struct mutex {
 #endif
 };
 
-struct ww_class;
-struct ww_acquire_ctx;
-
 struct ww_mutex {
struct mutex base;
struct ww_acquire_ctx *ctx;


[tip: locking/core] locking/mutex: Remove repeated declaration

2021-03-24 Thread tip-bot2 for Shaokun Zhang
The following commit has been merged into the locking/core branch of tip:

Commit-ID: 5965a7adbd72dd9b288c0911cb73719fed1efa08
Gitweb:
https://git.kernel.org/tip/5965a7adbd72dd9b288c0911cb73719fed1efa08
Author:Shaokun Zhang 
AuthorDate:Wed, 24 Mar 2021 13:40:40 +08:00
Committer: Ingo Molnar 
CommitterDate: Wed, 24 Mar 2021 08:15:19 +01:00

locking/mutex: Remove repeated declaration

Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and 
move
the pre-declarations to the top.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Ingo Molnar 
Acked-by: Waiman Long 
Link: 
https://lore.kernel.org/r/1616564440-61318-1-git-send-email-zhangshao...@hisilicon.com
---
 include/linux/mutex.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a..e7a1267 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 
+struct ww_class;
 struct ww_acquire_ctx;
 
 /*
@@ -65,9 +66,6 @@ struct mutex {
 #endif
 };
 
-struct ww_class;
-struct ww_acquire_ctx;
-
 struct ww_mutex {
struct mutex base;
struct ww_acquire_ctx *ctx;


[PATCH v2] locking/mutex: Remove repeated declaration

2021-03-23 Thread Shaokun Zhang
Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and 
move
the pre-declarations to the top.

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Will Deacon 
Cc: Waiman Long 
Cc: Boqun Feng 
Acked-by: Waiman Long 
Signed-off-by: Shaokun Zhang 
---
 include/linux/mutex.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a19727..e7a126796937 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 
+struct ww_class;
 struct ww_acquire_ctx;
 
 /*
@@ -65,9 +66,6 @@ struct mutex {
 #endif
 };
 
-struct ww_class;
-struct ww_acquire_ctx;
-
 struct ww_mutex {
struct mutex base;
struct ww_acquire_ctx *ctx;
-- 
2.7.4



Re: [PATCH] locking/mutex: Remove repeated declaration

2021-03-23 Thread Shaokun Zhang
Hi Ingo,

On 2021/3/23 19:23, Ingo Molnar wrote:
> 
> * Shaokun Zhang  wrote:
> 
>> Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti 
>> monster")
>> introduces 'struct ww_acquire_ctx' again, remove the repeated declaration.
>>
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Will Deacon 
>> Cc: Waiman Long 
>> Cc: Boqun Feng 
>> Signed-off-by: Shaokun Zhang 
>> ---
>>  include/linux/mutex.h | 2 --
>>  1 file changed, 2 deletions(-)
>>
>> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
>> index 0cd631a19727..d80c0e22c822 100644
>> --- a/include/linux/mutex.h
>> +++ b/include/linux/mutex.h
>> @@ -20,8 +20,6 @@
>>  #include 
>>  #include 
>>  
>> -struct ww_acquire_ctx;
>> -
>>  /*
>>   * Simple, straightforward mutexes with strict semantics:
>>   *
> 
> Please also group the pre-declarations together, that's the canonical 
> pattern we use in headers.

Ok,

> 
> I.e. have something like this at the top:
> 

Got it, I will do it in next version.

Thanks,
Shaokun

>   struct ww_class;
>   struct ww_acquire_ctx;
> 
> Thanks,
> 
>   Ingo
> .
> 


[PATCH] locking/mutex: Remove repeated declaration

2021-03-22 Thread Shaokun Zhang
Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration.

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Will Deacon 
Cc: Waiman Long 
Cc: Boqun Feng 
Signed-off-by: Shaokun Zhang 
---
 include/linux/mutex.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a19727..d80c0e22c822 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,8 +20,6 @@
 #include 
 #include 
 
-struct ww_acquire_ctx;
-
 /*
  * Simple, straightforward mutexes with strict semantics:
  *
-- 
2.7.4



Re: [PATCH] fs/buffer.c: Add checking buffer head stat before clear

2021-02-07 Thread Shaokun Zhang
Hi Andrew,

在 2021/2/6 7:45, Andrew Morton 写道:
> On Wed, 3 Feb 2021 14:14:50 +0800 Shaokun Zhang  
> wrote:
> 
>> From: Yang Guo 
>>
>> clear_buffer_new() is used to clear the buffer new state. When PAGE_SIZE
>> is 64K, most buffer heads in the list do not need to be cleared.
>> clear_buffer_new() performs an expensive atomic modification, so
>> check the buffer head before clearing it, as __block_write_begin_int
>> does, which is good for performance.
> 
> Did this produce any measurable improvement?

It has been tested on Huawei Kunpeng 920, which is an ARM64 platform, and the test 
command is below:
numactl --cpunodebind=0 --membind=0 fio -name=randwrite -numjobs=16 
-filename=/mnt/test1
-rw=randwrite -ioengine=libaio -direct=0 -iodepth=64 -sync=0 -norandommap 
-group_reporting
-runtime=60 -time_based -bs=4k -size=5G

The test result before patch:
WRITE: bw=930MiB/s (976MB/s), 930MiB/s-930MiB/s (976MB/s-976MB/s), io=54.5GiB 
(58.5GB),
run=60001-60001msec

The test result after patch:
WRITE: bw=958MiB/s (1005MB/s), 958MiB/s-958MiB/s (1005MB/s-1005MB/s), 
io=56.1GiB (60.3GB),
run=60001-60001msec

> 
> Perhaps we should give clear_buffer_x() the same optimization as
> set_buffer_x()?
> 

Good catch,
but we looked into it further: if we do the same as set_buffer_x(),
much more code would need to be changed. For example,
ext4_wait_block_bitmap already does a sanity check using buffer_new,
and clear_buffer_new would then check it again.

Thanks,
Shaokun

> 
> static __always_inline void set_buffer_##name(struct buffer_head *bh) \
> { \
>   if (!test_bit(BH_##bit, &(bh)->b_state))\
>   set_bit(BH_##bit, &(bh)->b_state);  \
> } \
> static __always_inline void clear_buffer_##name(struct buffer_head *bh)   
> \
> { \
>   clear_bit(BH_##bit, &(bh)->b_state);\
> } \
> 
> 
> .
> 


[PATCH -next] xfs: Fix unused variable 'mp' warning

2021-02-03 Thread Shaokun Zhang
There is a warning on arm64 platform:
  CC [M]  fs/xfs/xfs_ioctl32.o
fs/xfs/xfs_ioctl32.c: In function ‘xfs_file_compat_ioctl’:
fs/xfs/xfs_ioctl32.c:441:20: warning: unused variable ‘mp’ [-Wunused-variable]
  441 |  struct xfs_mount *mp = ip->i_mount;
  |^~
  LD [M]  fs/xfs/xfs.o

Fix this warning.

Fixes: f736d93d76d3 ("xfs: support idmapped mounts")
Cc: "Darrick J. Wong" 
Cc: Christoph Hellwig 
Cc: Christian Brauner  
Signed-off-by: Shaokun Zhang 
---
 fs/xfs/xfs_ioctl32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 926427b19573..fd590c0b5d3b 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -438,7 +438,6 @@ xfs_file_compat_ioctl(
 {
struct inode*inode = file_inode(filp);
struct xfs_inode*ip = XFS_I(inode);
-   struct xfs_mount*mp = ip->i_mount;
void__user *arg = compat_ptr(p);
int error;
 
@@ -446,6 +445,8 @@ xfs_file_compat_ioctl(
 
switch (cmd) {
 #if defined(BROKEN_X86_ALIGNMENT)
+   struct xfs_mount*mp = ip->i_mount;
+
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
case XFS_IOC_ALLOCSP64_32:
-- 
2.7.4



[PATCH] fs/buffer.c: Add checking buffer head stat before clear

2021-02-02 Thread Shaokun Zhang
From: Yang Guo 

clear_buffer_new() is used to clear the buffer new state. When PAGE_SIZE
is 64K, most buffer heads in the list do not need to be cleared.
clear_buffer_new() performs an expensive atomic modification, so
check the buffer head before clearing it, as __block_write_begin_int
does, which is good for performance.

Cc: Alexander Viro 
Cc: Andrew Morton 
Cc: Nick Piggin 
Signed-off-by: Yang Guo 
Signed-off-by: Shaokun Zhang 
---
 fs/buffer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 32647d2011df..f1c3a5b27a90 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, 
struct page *page,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
-   clear_buffer_new(bh);
+   if (buffer_new(bh))
+   clear_buffer_new(bh);
 
block_start = block_end;
bh = bh->b_this_page;
-- 
2.7.4



Re: [PATCH v2 1/3] perf vendor events: Add cache refill and DCZVA events

2021-01-21 Thread Shaokun Zhang
Hi,

在 2021/1/21 18:54, Shunsuke Nakamura 写道:
> Adds L1 data cache refill prefetch, L2 data cache refill prefetch, 
> and DCZVA instruction events.

A silly question: does Arm define these events? I checked the Arm ARM
document (DDI0487Fc) and these event numbers are listed as reserved. Or
maybe I am missing something.

Thanks,
Shaokun

> 
> Signed-off-by: Shunsuke Nakamura 
> ---
>  .../perf/pmu-events/arch/arm64/armv8-recommended.json  | 18 
> ++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json 
> b/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
> index d0a1986..ee0e67d 100644
> --- a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
> +++ b/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
> @@ -54,6 +54,12 @@
>  "BriefDescription": "L1D cache invalidate"
>  },
>  {
> +"PublicDescription": "This event counts L1D_CACHE_REFILL caused by 
> software or hardware prefetch.",
> +"EventCode": "0x49",
> +"EventName": "L1D_CACHE_REFILL_PRF",
> +"BriefDescription": "This event counts L1D_CACHE_REFILL caused by 
> software or hardware prefetch."
> +},
> +{
>  "PublicDescription": "Attributable Level 1 data TLB refill, read",
>  "EventCode": "0x4C",
>  "EventName": "L1D_TLB_REFILL_RD",
> @@ -120,6 +126,12 @@
>  "BriefDescription": "L2D cache invalidate"
>  },
>  {
> +"PublicDescription": "This event counts L2D_CACHE_REFILL caused by 
> software or hardware prefetch.",
> +"EventCode": "0x59",
> +"EventName": "L2D_CACHE_REFILL_PRF",
> +"BriefDescription": "This event counts L2D_CACHE_REFILL caused by 
> software or hardware prefetch."
> +},
> +{
>  "PublicDescription": "Attributable Level 2 data or unified TLB 
> refill, read",
>  "EventCode": "0x5c",
>  "EventName": "L2D_TLB_REFILL_RD",
> @@ -408,6 +420,12 @@
>  "BriefDescription": "Release consistency operation speculatively 
> executed, Store-Release"
> },
> {
> + "PublicDescription": "This event counts architecturally executed 
> zero blocking operations due to the 'DC ZVA' instruction.",
> + "EventCode": "0x9f",
> + "EventName": "DCZVA_SPEC",
> + "BriefDescription": "This event counts architecturally executed 
> zero blocking operations due to the 'DC ZVA' instruction."
> +   },
> +   {
>  "PublicDescription": "Attributable Level 3 data or unified cache 
> access, read",
>  "EventCode": "0xa0",
>  "EventName": "L3D_CACHE_RD",
> 


[PATCH v8] lib: optimize cpumask_local_spread()

2021-01-04 Thread Shaokun Zhang
From: Yuqi Jin 

In multi-processor and NUMA systems, an I/O driver looks for the cpu cores
to which its IRQs shall be bound. When the cpu cores in the local numa node
have been used up, it is better to find the node closest to the local numa
node for performance, instead of choosing any online cpu immediately.

On arm64 or x86 platform that has 2-sockets and 4-NUMA nodes, if the
network card is located in node2 of socket1, while the number queues
of network card is greater than the number of cores of node2, when all
cores of node2 has been bound to the queues, the remaining queues will
be bound to the cores of node0. That's not friendly to performance.
Let's improve it and find the nearest unused node through NUMA distance
for the non-local NUMA nodes.

On Huawei Kunpeng 920 server, there are 4 NUMA nodes (0 - 3) in the 2-cpu
system (0 - 1). The topology of this server is as follows:
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 63379 MB
node 0 free: 61899 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 
46 47
node 1 size: 64509 MB
node 1 free: 63942 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
70 71
node 2 size: 64509 MB
node 2 free: 63056 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 
94 95
node 3 size: 63997 MB
node 3 free: 63420 MB
node distances:
node   0   1   2   3
  0:  10  16  32  33
  1:  16  10  25  32
  2:  32  25  10  16
  3:  33  32  16  10

We perform PS (parameter server) business test, the behavior of the
service is that the client initiates a request through the network card,
the server responds to the request after calculation.  When two PS
processes run on node2 and node3 separately and the network card is
located on 'node2' which is in cpu1, the performance of node2 (26W QPS)
and node3 (22W QPS) is different.

It is better that the NIC queues are bound to the cpu1 cores in turn, then
XPS will also be properly initialized, while cpumask_local_spread only
considers the local node. When the number of NIC queues exceeds the
number of cores in the local node, it returns to the online core directly.
So when PS runs on node3 sending a calculated request, the performance is
not as good as the node2.

The IRQ from 369-392 will be bound from NUMA node0 to NUMA node3 with this
patch, before the patch:

Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
0
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
1
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
22
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
23
After the patch:
Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
72
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
73
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
94
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
95

So the performance of the node3 is the same as node2 that is 26W QPS when
the network card is still in 'node2' with the patch.

Cc: Dave Hansen 
Cc: Rusty Russell 
Cc: Andrew Morton 
Cc: Juergen Gross 
Cc: Paul Burton 
Cc: Michal Hocko 
Cc: Michael Ellerman 
Cc: Mike Rapoport 
Cc: Anshuman Khandual 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
ChangeLog from v7:
1. Fix confusing comments
2. Improve the loop for choosing the nearest node.

ChangeLog from v6:
1. Addressed Dave comments
2. Fix the warning from Hulk Robot
3. Simplify the git log.

ChangeLog from v5:
1. Rebase to 5.10-rc2

ChangeLog from v4:
1. Rebase to 5.6-rc3 

ChangeLog from v3:
1. Make spread_lock local to cpumask_local_spread();
2. Add more descriptions on the affinities change in log;

ChangeLog from v2:
1. Change the variables as static and use spinlock to protect;
2. Give more explanation on test and performance;

 include/linux/cpumask.h |  2 +-
 lib/cpumask.c   | 73 -
 2 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e30f12..ab0c2a39bfb4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -189,7 +189,7 @@ static inline unsigned int cpumask_any_but(const struct 
cpumask *mask,
return 1;
 }
 
-static inline unsigned int cpumask_local_spread(unsigned int i, int node)
+static inline unsigned int cpumask_local_spread(unsigned int cpu_index, int 
node)
 {
return 0;
 }
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 35924025097b..1885c314e410 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -193,47 +193,82 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+static int find_nearest_node(int node, nodemask_t nodes)
+{
+   int i, min_dist, node_id = -1;
+
+   /* Choose the first unused node to compare */
+   for (i = 0; i < nr_node_ids; i++)
+   if (!node_isset(i, nodes)) {
+   min_dist = node_dista

Re: [PATCH v7] lib: optimize cpumask_local_spread()

2020-12-10 Thread Shaokun Zhang
Hi Dave,

Apologies for the late reply.

在 2020/12/1 1:08, Dave Hansen 写道:
  {
 -  int cpu, hk_flags;
 +  static DEFINE_SPINLOCK(spread_lock);
 +  static bool used[MAX_NUMNODES];
>>>
>>> I thought I mentioned this last time.  How large is this array?  How
>>> large would it be if it were a nodemask_t?  Would this be less code if
>>
>> Apologies that I forgot to do it.
>>
>>> you just dynamically allocated and freed the node mask instead of having
>>> a spinlock and a memset?
>>
>> Ok, but I think the spinlock is also needed, do I miss something?
> 
> There was no spinlock there before your patch.  You just need it to
> protect the structures you declared static.  If you didn't have static
> structures, you wouldn't need a lock.

Got it, I will allocate it dynamically.

> 
 +  unsigned long flags;
 +  int cpu, hk_flags, j, id;
const struct cpumask *mask;
  
hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
 @@ -352,20 +379,27 @@ unsigned int cpumask_local_spread(unsigned int i, 
 int node)
return cpu;
}
} else {
 -  /* NUMA first. */
 -  for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
 -  if (i-- == 0)
 -  return cpu;
 +  spin_lock_irqsave(&spread_lock, flags);
 +  memset(used, 0, nr_node_ids * sizeof(bool));
 +  /* select node according to the distance from local node */
 +  for (j = 0; j < nr_node_ids; j++) {
 +  id = find_nearest_node(node, used);
 +  if (id < 0)
 +  break;
>>>
>>> There's presumably an outer loop in a driver which is trying to bind a
>>> bunch of interrupts to a bunch of CPUs.  We know there are on the order
>>> of dozens of these interrupts.
>>>
>>> for_each_interrupt() // in the driver
>>> for (j=0;j<nr_node_ids;j++) // cpumask_local_spread()
>>> // find_nearest_node():
>>> for (i = 0; i < nr_node_ids; i++) {
>>> for (i = 0; i < nr_node_ids; i++) {
>>>
>>> Does this worry anybody else?  It thought our upper limits on the number
>>> of NUMA nodes was 1024.  Doesn't that make our loop O(N^3) where the
>>> worst case is hundreds of millions of loops?
>>
>> If there are 1024 NUMA nodes in a real system, it is even more worthwhile
>> to find the nearest node rather than choose a random one. And it is only
>> called during I/O device initialization. Comments are also given for this
>> interface.
> 
> This doesn't really make me feel better.  An end user booting this on a

My bad, I only want to explain the issue.

> big system with a bunch of cards could see a minutes-long delay.  I can

Indeed.

> also see funky stuff happening like if we have a ton of NUMA nodes and
> few CPUs.
> 
>>> I don't want to prematurely optimize this, but that seems like something
>>> that might just fall over on bigger systems.
>>>
>>> This also seems really wasteful if we have a bunch of memory-only nodes.
>>>  Each of those will be found via find_nearest_node(), but then this loop:
>>
>> Got it, all the effort is spent on choosing the nearest node for
>> performance. If we don't do it, I think someone will have to debug this
>> in the future.
> 
> If we're going to kick the can down the road for some poor sod to debug,
> can we at least help them out with a warning?
> 
> Maybe we WARN_ONCE() after we fall back for more than 2 or 3 nodes.
> 

Ok,

> But, I still don't think you've addressed my main concern: This is
> horrifically inefficient searching for CPUs inside nodes that are known
> to have no CPUs.

How about optimizing as follows:
+   for (j = 0; j < nr_node_ids; j++) {
+   id = find_nearest_node(node, nodes);
+   if (id < 0)
+   break;
+   nmask = cpumask_of_node(id);
+   cpumask_and(_possible_mask, , & nmask);
+   cpu_of_node = cpumask_weight(node_possible_mask);
+   if (cpu_index > cpu_of_node) {
+   cpu_index -= cpu_of_node;
+   node_set(id, nodes);
+   continue;
+   }
+
+   for_each_cpu(cpu, node_possible_mask)
+   if (cpu_index-- == 0)
+   return cpu;
+
+   node_set(id, nodes);
}

> 
 +  for_each_cpu_and(cpu, cpumask_of_node(id), mask)
 +  if (i-- == 0) {
 +  spin_unlock_irqrestore(&spread_lock,
 + flags);
 +  return cpu;
 +  }
 +  used[id] = true;
}
>>>
>>> Will just exit immediately because 

Re: [PATCH v7] lib: optimize cpumask_local_spread()

2020-11-26 Thread Shaokun Zhang
Hi Dave,

Apologies for the late reply.

在 2020/11/21 1:48, Dave Hansen 写道:
> On 11/17/20 6:54 PM, Shaokun Zhang wrote:
>> From: Yuqi Jin 
>>
>> In multi-processor and NUMA system, I/O driver will find cpu cores that
>> which shall be bound IRQ. When cpu cores in the local numa have been
>> used up, it is better to find the node closest to the local numa node
>> for performance, instead of choosing any online cpu immediately.
>>
>> On arm64 or x86 platform that has 2-sockets and 4-NUMA nodes, if the
>> network card is located in node2 of socket1, while the number queues
>> of network card is greater than the number of cores of node2, when all
>> cores of node2 has been bound to the queues, the remaining queues will
>> be bound to the cores of node0 which is further than NUMA node3.
> 
> That's quite the run-on sentence. :)
> 
>> It is
>> not friendly for performance or Intel's DDIO (Data Direct I/O Technology)
> 
> Could you explain *why* it is not friendly to DDIO specifically?  This
> patch affects where the interrupt handler runs.  But, DDIO is based on
> memory locations rather than the location of the interrupt handler.
> 
> It would be ideal to make that connection: How does the location of the
> interrupt handler impact the memory allocation location?
> 

When the interrupt handler runs on a different chip, the BD, packet header,
and even payload are still required by the RX packet interrupt handler.
However, DDIO cannot deliver that data there.

>> when if the user enables SNC (sub-NUMA-clustering).
> 
> Again, the role that SNC plays here isn't spelled out.  I *believe* it's
> because SNC ends up reducing the number of CPUs in each NUMA node.  That
>  makes the existing code run out of CPUs to which to bind to the "local"
> node sooner.

Yes.

> 
>> +static int find_nearest_node(int node, bool *used)
>> +{
>> +int i, min_dist, node_id = -1;
>> +
>> +/* Choose the first unused node to compare */
>> +for (i = 0; i < nr_node_ids; i++) {
>> +if (used[i] == false) {
>> +min_dist = node_distance(node, i);
>> +node_id = i;
>> +break;
>> +}
>> +}
>> +
>> +/* Compare and return the nearest node */
>> +for (i = 0; i < nr_node_ids; i++) {
>> +if (node_distance(node, i) < min_dist && used[i] == false) {
>> +min_dist = node_distance(node, i);
>> +node_id = i;
>> +}
>> +}
>> +
>> +return node_id;
>> +}
>> +
>>  /**
>>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>>   * @i: index number
>>   * @node: local numa_node
>>   *
>>   * This function selects an online CPU according to a numa aware policy;
>> - * local cpus are returned first, followed by non-local ones, then it
>> - * wraps around.
>> + * local cpus are returned first, followed by the next one which is the
>> + * nearest unused NUMA node based on NUMA distance, then it wraps around.
>>   *
>>   * It's not very efficient, but useful for setup.
>>   */
>>  unsigned int cpumask_local_spread(unsigned int i, int node)
> 
> FWIW, I think 'i' is criminally bad naming.  It should be called
> nr_cpus_to_skip or something similar.
> 

Ok, I really didn't consider this parameter naming before.

> I also detest the comments that are there today.
> 
>   Loop through all the online CPUs on the system.  Start with the
>   CPUs on 'node', then fall back to CPUs on NUMA nodes which are
>   increasingly far away.
> 
>   Skip the first 'nr_cpus_to_skip' CPUs which are found.
> 
>   This function is not very efficient, especially for large
>   'nr_cpus_to_skip' because it loops over the same CPUs on each
>   call and does not remember its state from previous calls.
> 

Apologies for my bad comment; I will follow your suggestion.

>>  {
>> -int cpu, hk_flags;
>> +static DEFINE_SPINLOCK(spread_lock);
>> +static bool used[MAX_NUMNODES];
> 
> I thought I mentioned this last time.  How large is this array?  How
> large would it be if it were a nodemask_t?  Would this be less code if

Apologies that I forgot to do it.

> you just dynamically allocated and freed the node mask instead of having
> a spinlock and a memset?
> 

Ok, but I think the spinlock is still needed; am I missing something?

>> +unsigned long flags;
>> +int cpu, hk_flags, j, id;
>>  const struct cpumask *mask;
>>  
>>  hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
>&

[PATCH v7] lib: optimize cpumask_local_spread()

2020-11-17 Thread Shaokun Zhang
From: Yuqi Jin 

In multi-processor NUMA systems, an I/O driver looks for the CPU cores to
which its IRQs shall be bound. When the CPU cores in the local NUMA node have
been used up, it is better to pick the node closest to the local NUMA node
for performance, instead of choosing any online CPU immediately.

On an arm64 or x86 platform that has 2 sockets and 4 NUMA nodes, if the
network card is located in node2 of socket1 and the number of queues
of the network card is greater than the number of cores in node2, then once
all cores of node2 have been bound to queues, the remaining queues will
be bound to the cores of node0, which is further away than NUMA node3. This
is not friendly for performance or for Intel's DDIO (Data Direct I/O
Technology) when the user enables SNC (sub-NUMA-clustering).
Let's improve it and find the nearest unused node through NUMA distance
for the non-local NUMA nodes.

On the Huawei Kunpeng 920 server, there are 4 NUMA nodes (0 - 3) in the 2-cpu
system (0 - 1). The topology of this server is as follows:
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 63379 MB
node 0 free: 61899 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 
46 47
node 1 size: 64509 MB
node 1 free: 63942 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
70 71
node 2 size: 64509 MB
node 2 free: 63056 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 
94 95
node 3 size: 63997 MB
node 3 free: 63420 MB
node distances:
node   0   1   2   3
  0:  10  16  32  33
  1:  16  10  25  32
  2:  32  25  10  16
  3:  33  32  16  10

We perform PS (parameter server) business test, the behavior of the
service is that the client initiates a request through the network card,
the server responds to the request after calculation.  When two PS
processes run on node2 and node3 separately and the network card is
located on 'node2' which is in cpu1, the performance of node2 (26W QPS)
and node3 (22W QPS) is different.

It is better that the NIC queues are bound to the cpu1 cores in turn, then
XPS will also be properly initialized, while cpumask_local_spread only
considers the local node. When the number of NIC queues exceeds the
number of cores in the local node, it returns to the online core directly.
So when PS runs on node3 sending a calculated request, the performance is
not as good as the node2.

The IRQ from 369-392 will be bound from NUMA node0 to NUMA node3 with this
patch, before the patch:

Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
0
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
1
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
22
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
23
After the patch:
Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
72
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
73
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
94
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
95

So the performance of the node3 is the same as node2 that is 26W QPS when
the network card is still in 'node2' with the patch.

Cc: Dave Hansen 
Cc: Rusty Russell 
Cc: Andrew Morton 
Cc: Juergen Gross 
Cc: Paul Burton 
Cc: Michal Hocko 
Cc: Michael Ellerman 
Cc: Mike Rapoport 
Cc: Anshuman Khandual 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
ChangeLog from v6:
1. Addressed Dave's comments
2. Fix the warning from Hulk Robot
3. Simplify the git log.

ChangeLog from v5:
1. Rebase to 5.10-rc2

ChangeLog from v4:
1. Rebase to 5.6-rc3 

ChangeLog from v3:
1. Make spread_lock local to cpumask_local_spread();
2. Add more descriptions on the affinities change in log;

ChangeLog from v2:
1. Change the variables as static and use spinlock to protect;
2. Give more explanation on test and performance;

 lib/cpumask.c | 60 ++-
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 97a005ffde31..516d7237e302 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -325,20 +325,47 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+static int find_nearest_node(int node, bool *used)
+{
+   int i, min_dist, node_id = -1;
+
+   /* Choose the first unused node to compare */
+   for (i = 0; i < nr_node_ids; i++) {
+   if (used[i] == false) {
+   min_dist = node_distance(node, i);
+   node_id = i;
+   break;
+   }
+   }
+
+   /* Compare and return the nearest node */
+   for (i = 0; i < nr_node_ids; i++) {
+   if (node_distance(node, i) < min_dist && used[i] == false) {
+   min_dist = node_distance(node, i);
+   node_id = i;
+   }
+   }
+
+   return node_id;
+}
+
 /**
  * cpumask_local_spread 

Re: [PATCH v6] lib: optimize cpumask_local_spread()

2020-11-16 Thread Shaokun Zhang
Hi Dave,

在 2020/11/16 22:48, Dave Hansen 写道:
> On 11/15/20 11:59 PM, Shaokun Zhang wrote:
>>> Do you want to take another pass at submitting this patch?
>> 'Another pass'? Sorry for my bad understading, I don't follow it correctly.
> 
> Could you please incorporate the feedback that I've given about this
> version of the patch and write a new version?

Yeah, I will address your comments and send a new version soon.

Cheers,
Shaokun.

> 


Re: [PATCH v6] lib: optimize cpumask_local_spread()

2020-11-16 Thread Shaokun Zhang
Hi Dave,

在 2020/11/14 0:02, Dave Hansen 写道:
> On 11/12/20 6:06 PM, Shaokun Zhang wrote:
>>>> On Huawei Kunpeng 920 server, there are 4 NUMA node(0 - 3) in the 2-cpu
>>>> system(0 - 1). The topology of this server is followed:
>>>
>>> This is with a feature enabled that Intel calls sub-NUMA-clustering
>>> (SNC), right?  Explaining *that* feature would also be great context for
>>
>> Correct,
>>
>>> why this gets triggered on your system and not normally on others and
>>> why nobody noticed this until now.
>>
>> This is on intel 6248 platform:
> 
> I have no idea what a "6248 platform" is.
> 

My apologies that it's Cascade Lake, [1]

>>>> +static void calc_node_distance(int *node_dist, int node)
>>>> +{
>>>> +  int i;
>>>> +
>>>> +  for (i = 0; i < nr_node_ids; i++)
>>>> +  node_dist[i] = node_distance(node, i);
>>>> +}
>>>
>>> This appears to be the only place node_dist[] is written.  That means it
>>> always contains a one-dimensional slice of the two-dimensional data
>>> represented by node_distance().
>>>
>>> Why is a copy of this data needed?
>>
>> It is used to store the distance with the @node for later, apologies that I
>> can't follow your question correctly.
> 
> Right, the data that you store is useful.  *But*, it's also a verbatim
> copy of the data from node_distance().  Why not just use node_distance()
> directly in your code rather than creating a partial copy of it in the

Ok, I will remove this redundant function in next version.

> local node_dist[] array?
> 
> 
>>>>  unsigned int cpumask_local_spread(unsigned int i, int node)
>>>>  {
>>>> -  int cpu, hk_flags;
>>>> +  static DEFINE_SPINLOCK(spread_lock);
>>>> +  static int node_dist[MAX_NUMNODES];
>>>> +  static bool used[MAX_NUMNODES];
>>>
>>> Not to be *too* picky, but there is a reason we declare nodemask_t as a
>>> bitmap and not an array of bools.  Isn't this just wasteful?
>>>
>>>> +  unsigned long flags;
>>>> +  int cpu, hk_flags, j, id;
>>>>const struct cpumask *mask;
>>>>  
>>>>hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
>>>> @@ -220,20 +256,28 @@ unsigned int cpumask_local_spread(unsigned int i, 
>>>> int node)
>>>>return cpu;
>>>>}
>>>>} else {
>>>> -  /* NUMA first. */
>>>> -  for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
>>>> -  if (i-- == 0)
>>>> -  return cpu;
>>>> -  }
>>>> +  spin_lock_irqsave(&spread_lock, flags);
>>>> +  memset(used, 0, nr_node_ids * sizeof(bool));
>>>> +  calc_node_distance(node_dist, node);
>>>> +  /* Local node first then the nearest node is used */
>>>
>>> Is this comment really correct?  This makes it sound like there is only
>>
>> I think it is correct, that's what we want to choose the nearest node.
>>
>>> fallback to a single node.  Doesn't the _code_ fall back basically
>>> without limit?
>>
>> If I follow your question correctly, without this patch, if the local
>> node is used up, one random node will be choosed, right? Now we firstly
>> choose the nearest node by the distance, if all nodes has been choosen,
>> it will return the initial solution.
> 
> The comment makes it sound like the code does:
>   1. Do the local node
>   2. Do the next nearest node
>   3. Stop
> 

That's clearer; I will update the comments in the new patch.

> In reality, I *think* it's more of a loop where it search
> ever-increasing distances away from the local node.
> 
> I just think the comment needs to be made more precise.

Got it.

> 
>>>> +  for (j = 0; j < nr_node_ids; j++) {
>>>> +  id = find_nearest_node(node_dist, used);
>>>> +  if (id < 0)
>>>> +  break;
>>>>  
>>>> -  for_each_cpu(cpu, mask) {
>>>> -  /* Skip NUMA nodes, done above. */
>>>> -  if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
>>>> -  continue;
>>>> +  for_each_cpu_and(cpu, cpumask_of_node(id), mask)
>>>> +  if (i-- == 0) {
>>>> +  spin_unlock_irqrestore(&spread_lock,
>>>> + flags);
>>>> +  return cpu;
>>>> +  }
>>>> +  used[id] = 1;
>>>> +  }
>>>> +  spin_unlock_irqrestore(&spread_lock, flags);
>>>
>>> The existing code was pretty sparsely commented.  This looks to me to
>>> make it more complicated and *less* commented.  Not the best combo.
>>
>> Apologies for the bad comments, hopefully I describe it clearly by the above
>> explantion.
> 
> Do you want to take another pass at submitting this patch?

'Another pass'? Sorry for my poor understanding, I don't follow it correctly.

Thanks,
Shaokun

[1]https://en.wikichip.org/wiki/intel/xeon_gold/6248

> .
> 


Re: [PATCH v6] lib: optimize cpumask_local_spread()

2020-11-12 Thread Shaokun Zhang
Hi Dave,

在 2020/11/5 0:10, Dave Hansen 写道:
> On 11/3/20 5:39 AM, Shaokun Zhang wrote:
>> Currently, Intel DDIO affects only local sockets, so its performance
>> improvement is due to the relative difference in performance between the
>> local socket I/O and remote socket I/O.To ensure that Intel DDIO’s
>> benefits are available to applications where they are most useful, the
>> irq can be pinned to particular sockets using Intel DDIO.
>> This arrangement is called socket affinityi. So this patch can help
>> Intel DDIO work. The same I/O stash function for most processors
> 
> A great changelog would probably include a bit of context about DDIO.
> Even being from Intel, I'd heard about this, but I didn't immediately
> know what the acronym was.
> 
> The thing that matters here is that DDIO allows devices to use processor
> caches instead of having them always do uncached accesses to main
> memory.  That's a pretty important detail left out of the changelog.
> 
>> On Huawei Kunpeng 920 server, there are 4 NUMA node(0 - 3) in the 2-cpu
>> system(0 - 1). The topology of this server is followed:
> 
> This is with a feature enabled that Intel calls sub-NUMA-clustering
> (SNC), right?  Explaining *that* feature would also be great context for

Correct,

> why this gets triggered on your system and not normally on others and
> why nobody noticed this until now.

This is on intel 6248 platform:
[root@localhost ~]# numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 5 6 10 11 12 15 16 40 41 42 45 46 50 51 52 55 56
node 0 size: 46893 MB
node 0 free: 45982 MB
node 1 cpus: 3 4 7 8 9 13 14 17 18 19 43 44 47 48 49 53 54 57 58 59
node 1 size: 48379 MB
node 1 free: 48235 MB
node 2 cpus: 20 21 22 25 26 30 31 32 35 36 60 61 62 65 66 70 71 72 75 76
node 2 size: 48353 MB
node 2 free: 48022 MB
node 3 cpus: 23 24 27 28 29 33 34 37 38 39 63 64 67 68 69 73 74 77 78 79
node 3 size: 48378 MB
node 3 free: 48196 MB
node distances:
node   0   1   2   3
  0:  10  11  21  21
  1:  11  10  21  21
  2:  21  21  10  11
  3:  21  21  11  10
[root@localhost ~]#
When the Intel client turns on SNC, the mlx5 network card is used and is located
in node2, while the number of queues of the network card is greater than the
number of cores of node2. When all cores in node2 have been bound, a core of
node0 will be chosen, resulting in cross-chip access and DDIO failure. If this
patch is applied, node3 will be chosen to avoid this cross-chip operation.

> 
>> The IRQ from 369-392 will be bound from NUMA node0 to NUMA node3 with this
>> patch, before the patch:
>>
>> Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
>> 0
>> Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
>> 1
>> ...
>> Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
>> 22
>> Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
>> 23
>> After the patch:
> 
> I _think_ what you are trying to convey here is that IRQs 369 and 370
> are from devices plugged in to one socket, but their IRQs are bound to
> CPUs 0 and 1 which are in the other socket.  Once device traffic leaves
> the socket, it can no longer use DDIO and performance suffers.
> 
> The same situation is true for IRQs 391/392 and CPUs 22/23.
> 
> You don't come out and say it, but I assume that the root of this issue
> is that once we fill up a NUMA node worth of CPUs with an affinitized
> IRQ per CPU, we go looking for CPUs in other NUMA nodes.  In this case,
> we have the processor in this weird mode that chops sockets into two
> NUMA nodes, which makes the device's NUMA node fill up faster.
> 
> The current behavior just "wraps around" to find a new node.  But, this
> wrap around behavior is nasty in this case because it might cross a socket.
> 

We want to improve the situation where the card is inserted in the second socket
but is bound to the first socket's CPU cores, so we want to calculate
the distance between different NUMA nodes and choose the nearest node; it is
not a simple wrap-around.

>> +static void calc_node_distance(int *node_dist, int node)
>> +{
>> +int i;
>> +
>> +for (i = 0; i < nr_node_ids; i++)
>> +node_dist[i] = node_distance(node, i);
>> +}
> 
> This appears to be the only place node_dist[] is written.  That means it
> always contains a one-dimensional slice of the two-dimensional data
> represented by node_distance().
> 
> Why is a copy of this data needed?

It is used to store the distance with the @node for later, apologies that I
can't follow your question correctly.

> 
>> +static int find_nearest_node(int *node_dist, bool *used)
>> +{
>> +int i, min_dist = node_dist[0], node_id = -1;
>> +
>> +/* Choose 

[PATCH v6] lib: optimize cpumask_local_spread()

2020-11-03 Thread Shaokun Zhang
From: Yuqi Jin 

In multi-processor NUMA systems, an I/O driver looks for the CPU cores to
which its IRQs shall be bound.  When the CPU cores in the local NUMA node have
been used, it is better to pick the node closest to the local NUMA node for
performance, instead of choosing any online CPU immediately.

Currently, Intel DDIO affects only local sockets, so its performance
improvement is due to the relative difference in performance between the
local socket I/O and remote socket I/O. To ensure that Intel DDIO’s
benefits are available to applications where they are most useful, the
irq can be pinned to particular sockets using Intel DDIO.
This arrangement is called socket affinity. So this patch can help
Intel DDIO work. The same I/O stash function for most processors

On the Huawei Kunpeng 920 server, there are 4 NUMA nodes (0 - 3) in the 2-cpu
system (0 - 1). The topology of this server is as follows:
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 63379 MB
node 0 free: 61899 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 
46 47
node 1 size: 64509 MB
node 1 free: 63942 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
70 71
node 2 size: 64509 MB
node 2 free: 63056 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 
94 95
node 3 size: 63997 MB
node 3 free: 63420 MB
node distances:
node   0   1   2   3
  0:  10  16  32  33
  1:  16  10  25  32
  2:  32  25  10  16
  3:  33  32  16  10

We perform PS (parameter server) business test, the behavior of the
service is that the client initiates a request through the network card,
the server responds to the request after calculation.  When two PS
processes run on node2 and node3 separately and the network card is
located on 'node2' which is in cpu1, the performance of node2 (26W QPS)
and node3 (22W QPS) is different.

It is better that the NIC queues are bound to the cpu1 cores in turn, then
XPS will also be properly initialized, while cpumask_local_spread only
considers the local node.  When the number of NIC queues exceeds the
number of cores in the local node, it returns to the online core directly.
So when PS runs on node3 sending a calculated request, the performance is
not as good as the node2.

The IRQ from 369-392 will be bound from NUMA node0 to NUMA node3 with this
patch, before the patch:

Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
0
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
1
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
22
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
23
After the patch:
Euler:/sys/bus/pci # cat /proc/irq/369/smp_affinity_list
72
Euler:/sys/bus/pci # cat /proc/irq/370/smp_affinity_list
73
...
Euler:/sys/bus/pci # cat /proc/irq/391/smp_affinity_list
94
Euler:/sys/bus/pci # cat /proc/irq/392/smp_affinity_list
95

So the performance of the node3 is the same as node2 that is 26W QPS when
the network card is still in 'node2' with the patch.

It is considered that the NIC and other I/O devices shall initialize the
interrupt binding, if the cores of the local node are used up, it is
reasonable to return the node closest to it.  Let's optimize it and find
the nearest node through NUMA distance for the non-local NUMA nodes.

Cc: Rusty Russell 
Cc: Andrew Morton 
Cc: Juergen Gross 
Cc: Paul Burton 
Cc: Michal Hocko 
Cc: Michael Ellerman 
Cc: Mike Rapoport 
Cc: Anshuman Khandual 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
Hi Andrew,

I rebased this patch later following this thread [1]

ChangeLog from v5:
1. Rebase to 5.10-rc2

ChangeLog from v4:
1. Rebase to 5.6-rc3 

ChangeLog from v3:
1. Make spread_lock local to cpumask_local_spread();
2. Add more descriptions on the affinities change in log;

ChangeLog from v2:
1. Change the variables as static and use spinlock to protect;
2. Give more explanation on test and performance;

[1]https://lkml.org/lkml/2020/6/30/1300

 lib/cpumask.c | 66 +--
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 85da6ab4fbb5..baecaf271770 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -193,6 +193,38 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+static void calc_node_distance(int *node_dist, int node)
+{
+   int i;
+
+   for (i = 0; i < nr_node_ids; i++)
+   node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used)
+{
+   int i, min_dist = node_dist[0], node_id = -1;
+
+   /* Choose the first unused node to compare */
+   for (i = 0; i < nr_node_ids; i++) {
+   if (used[i] == 0) {
+   min_dist = node_dist[i];
+   node_id = i;
+   break;
+   }
+   }
+
+   /* Compare and 

Re: [NAK] Re: [PATCH] fs: Optimized fget to improve performance

2020-08-30 Thread Shaokun Zhang
Hi Al,

在 2020/8/27 22:28, Al Viro 写道:
> On Thu, Aug 27, 2020 at 06:19:44PM +0800, Shaokun Zhang wrote:
>> From: Yuqi Jin 
>>
>> It is well known that the performance of atomic_add is better than that of
>> atomic_cmpxchg.
>> The initial value of @f_count is 1. While @f_count is increased by 1 in
>> __fget_files, it will go through three phases: > 0, < 0, and = 0. When the
>> fixed value 0 is used as the condition for terminating the increase of 1,
>> only atomic_cmpxchg can be used. When we use < 0 as the condition for
>> stopping plus 1, we can use atomic_add to obtain better performance.
> 
> Suppose another thread has just removed it from the descriptor table.
> 
>> +static inline bool get_file_unless_negative(atomic_long_t *v, long a)
>> +{
>> +long c = atomic_long_read(v);
>> +
>> +if (c <= 0)
>> +return 0;
> 
> Still 1.  Now the other thread has gotten to dropping the last reference,
> decremented counter to zero and committed to freeing the struct file.
> 

Apologies that I missed it.

>> +
>> +return atomic_long_add_return(a, v) - 1;
> 
> ... and you increment that sucker back to 1.  Sure, you return 0, so the
> caller does nothing to that struct file.  Which includes undoing the
> changes to its refecount.
> 
> In the meanwhile, the third thread does fget on the same descriptor,
> and there we end up bumping the refcount to 2 and succeeding.  Which
> leaves the caller with reference to already doomed struct file...
> 
>   IOW, NAK - this is completely broken.  The whole point of
> atomic_long_add_unless() is that the check and conditional increment
> are atomic.  Together.  That's what your optimization takes out.
> 

How about this? We try to replace atomic_cmpxchg with atomic_add to improve
performance. The atomic_add does not check the current f_count value.
Therefore, the number of online CPUs is reserved to prevent multi-core
competition.

+
+static inline bool get_file_unless(atomic_long_t *v, long a)
+{
+   long cpus = num_online_cpus();
+   long c = atomic_long_read(v);
+   long ret;
+
+   if (c > cpus || c < -cpus)
+   ret = atomic_long_add_return(a, v) - a;
+   else
+   ret = atomic_long_add_unless(v, a, 0);
+
+   return ret;
+}
+
 #define get_file_rcu_many(x, cnt)  \
-   atomic_long_add_unless(&(x)->f_count, (cnt), 0)
+   get_file_unless(&(x)->f_count, (cnt))

Thanks,
Shaokun

> .
> 



[PATCH] fs: Optimized fget to improve performance

2020-08-27 Thread Shaokun Zhang
From: Yuqi Jin 

It is well known that the performance of atomic_add is better than that of
atomic_cmpxchg.
The initial value of @f_count is 1. While @f_count is increased by 1 in
__fget_files, it will go through three phases: > 0, < 0, and = 0. When the
fixed value 0 is used as the condition for terminating the increase of 1,
only atomic_cmpxchg can be used. When we use < 0 as the condition for
stopping plus 1, we can use atomic_add to obtain better performance.

we test syscall in unixbench on Huawei Kunpeng920(arm64). We've got a 132%
performance boost. 

with this patch and the patch [1]
System Call Overhead9516926.2 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.09516926.2   6344.6
   
System Benchmarks Index Score (Partial Only) 6344.6

with this patch and without the patch [1]
System Call Overhead5290449.3 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.05290449.3   3527.0
   
System Benchmarks Index Score (Partial Only) 3527.0

without any patch
System Call Overhead4102310.5 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.04102310.5   2734.9
   
System Benchmarks Index Score (Partial Only) 2734.9

[1] https://lkml.org/lkml/2020/6/24/283

Cc: kernel test robot 
Cc: Will Deacon 
Cc: Mark Rutland 
Cc: Peter Zijlstra 
Cc: Alexander Viro 
Cc: Boqun Feng 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
Hi Rong,

Can you help to test this patch individually and with [1] together on
your platform please? [1] has been tested on your platform[2].

[2] https://lkml.org/lkml/2020/7/8/227

 include/linux/fs.h | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e019ea2f1347..2a9c2a30dc58 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -972,8 +972,19 @@ static inline struct file *get_file(struct file *f)
atomic_long_inc(&f->f_count);
return f;
 }
+
+static inline bool get_file_unless_negative(atomic_long_t *v, long a)
+{
+   long c = atomic_long_read(v);
+
+   if (c <= 0)
+   return 0;
+
+   return atomic_long_add_return(a, v) - 1;
+}
+
 #define get_file_rcu_many(x, cnt)  \
-   atomic_long_add_unless(&(x)->f_count, (cnt), 0)
+   get_file_unless_negative(&(x)->f_count, (cnt))
 #define get_file_rcu(x) get_file_rcu_many((x), 1)
 #define file_count(x)  atomic_long_read(&(x)->f_count)
 
-- 
2.7.4



[PATCH] iommu/iova: Replace cmpxchg with xchg in queue_iova

2020-08-27 Thread Shaokun Zhang
From: Yuqi Jin 

The performance of the atomic_xchg is better than atomic_cmpxchg because
no comparison is required. While the value of @fq_timer_on can only be 0
or 1. Let's use atomic_xchg instead of atomic_cmpxchg here because we
only need to check that the value changes from 0 to 1 or from 1 to 1.

Cc: Joerg Roedel 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
 drivers/iommu/iova.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 45a251da5453..30d969a4c5fd 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -579,7 +579,7 @@ void queue_iova(struct iova_domain *iovad,
 
/* Avoid false sharing as much as possible. */
if (!atomic_read(&iovad->fq_timer_on) &&
-   !atomic_cmpxchg(&iovad->fq_timer_on, 0, 1))
+   !atomic_xchg(&iovad->fq_timer_on, 1))
mod_timer(&iovad->fq_timer,
  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
 }
-- 
2.7.4



Re: [PATCH RESEND] fs: Move @f_count to different cacheline with @f_mode

2020-08-27 Thread Shaokun Zhang
Hi Aleksa,

在 2020/8/26 16:24, Aleksa Sarai 写道:
> On 2020-08-26, Shaokun Zhang  wrote:
>> 在 2020/8/22 0:02, Will Deacon 写道:
>>>   - This thing is tagged with __randomize_layout, so it doesn't help anybody
>>> using that crazy plugin
>>
>> This patch isolated the @f_count with @f_mode absolutely and we don't care 
>> the
>> base address of the structure, or I may miss something what you said.
> 
> __randomize_layout randomises the order of fields in a structure on each
> kernel rebuild (to make attacks against sensitive kernel structures
> theoretically harder because the offset of a field is per-build). It is

My bad, I missed the point of Will's comment because of my poor understanding of it.

> separate to ASLR or other base-related randomisation. However it depends
> on having CONFIG_GCC_PLUGIN_RANDSTRUCT=y and I believe (at least for
> distribution kernels) this isn't a widely-used configuration.

Thanks for more explanations about it, in our test, this config is also
disabled. If having CONFIG_GCC_PLUGIN_RANDSTRUCT=y, it seems this patch
will lose its value.
If it isn't widely-used for this config, hopefully we can do something on
the scene.

Thanks,
Shaokun

> 



Re: [PATCH RESEND] fs: Move @f_count to different cacheline with @f_mode

2020-08-26 Thread Shaokun Zhang
Hi Will,

在 2020/8/22 0:02, Will Deacon 写道:
> On Wed, Jun 24, 2020 at 04:32:28PM +0800, Shaokun Zhang wrote:
>> get_file_rcu_many, which is called by __fget_files, has used
>> atomic_try_cmpxchg now and it can reduce the access number of the global
>> variable to improve the performance of atomic instruction compared with
>> atomic_cmpxchg. 
>>
>> __fget_files does check the @f_mode with mask variable and will do some
>> atomic operations on @f_count, but both are on the same cacheline.
>> Many CPU cores do file access and it will cause much conflicts on @f_count. 
>> If we could make the two members into different cachelines, it shall relax
>> the siutations.
>>
>> We have tested this on ARM64 and X86, the result is as follows:
>> Syscall of unixbench has been run on Huawei Kunpeng920 with this patch:
>> 24 x System Call Overhead  1
>>
>> System Call Overhead3160841.4 lps   (10.0 s, 1 samples)
>>
>> System Benchmarks Partial Index  BASELINE   RESULTINDEX
>> System Call Overhead  15000.03160841.4   2107.2
>>
>> System Benchmarks Index Score (Partial Only) 2107.2
>>
>> Without this patch:
>> 24 x System Call Overhead  1
>>
>> System Call Overhead456.0 lps   (10.0 s, 1 samples)
>>
>> System Benchmarks Partial Index  BASELINE   RESULTINDEX
>> System Call Overhead  15000.0456.0   1481.6
>>
>> System Benchmarks Index Score (Partial Only) 1481.6
>>
>> And on Intel 6248 platform with this patch:
>> 40 CPUs in system; running 24 parallel copies of tests
>>
>> System Call Overhead4288509.1 lps   (10.0 s, 1 
>> samples)
>>
>> System Benchmarks Partial Index  BASELINE   RESULTINDEX
>> System Call Overhead  15000.04288509.1   2859.0
>>
>> System Benchmarks Index Score (Partial Only) 2859.0
>>
>> Without this patch:
>> 40 CPUs in system; running 24 parallel copies of tests
>>
>> System Call Overhead3666313.0 lps   (10.0 s, 1 
>> samples)
>>
>> System Benchmarks Partial Index  BASELINE   RESULTINDEX
>> System Call Overhead  15000.03666313.0   2444.2
>>        
>> System Benchmarks Index Score (Partial Only) 2444.2
>>
>> Cc: Will Deacon 
>> Cc: Mark Rutland 
>> Cc: Peter Zijlstra 
>> Cc: Alexander Viro 
>> Cc: Boqun Feng 
>> Signed-off-by: Yuqi Jin 
>> Signed-off-by: Shaokun Zhang 
>> ---
>>  include/linux/fs.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 3f881a892ea7..0faeab5622fb 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -955,7 +955,6 @@ struct file {
>>   */
>>  spinlock_t  f_lock;
>>  enum rw_hintf_write_hint;
>> -atomic_long_t   f_count;
>>  unsigned intf_flags;
>>  fmode_t f_mode;
>>  struct mutexf_pos_lock;
>> @@ -979,6 +978,7 @@ struct file {
>>  struct address_space*f_mapping;
>>  errseq_tf_wb_err;
>>  errseq_tf_sb_err; /* for syncfs */
>> +atomic_long_t   f_count;
>>  } __randomize_layout
>>__attribute__((aligned(4)));  /* lest something weird decides that 2 
>> is OK */
> 
> Hmm. So the microbenchmark numbers look lovely, but:

Thanks,

> 
>   - What impact does it actually have for real workloads?

It was exposed when we ran the unixbench test. As for real workloads, if a
workload has many threads that open the same file, the patch should be useful,
as it is for unixbench.
In other scenarios, there should be no regression with the patch because we
only change the position of @f_count relative to @f_mode.

>   - How do we avoid regressing performance by innocently changing the struct
> again later on?

This change to @f_count should be commented in the code; I'm not sure whether that is enough.

>   - This thing is tagged with __randomize_layout, so it doesn't help anybody
> using that crazy plugin

This patch isolates @f_count from @f_mode completely, and we don't care about
the base address of the structure, or I may have missed something in what you said.

>   - What about all the other atomics and locks that share cachelines?

An interesting question. To be honest, we found this issue by doing performance
profiling with unixbench, and then we wanted to relax the conflicts.
For other scenarios, this method may be useful if debugging reveals the same
kind of conflicts, but they can't be detected automatically.

Thanks,
Shaokun

> 
> Will
> 
> .
> 



Re: linux-next: Tree for Jul 30 [build failure on arm64]

2020-07-31 Thread Shaokun Zhang
Hi,

在 2020/7/31 16:30, Naresh Kamboju 写道:
> On Fri, 31 Jul 2020 at 09:38, Stephen Rothwell  wrote:
>>
>> Hi all,
>>
>> On Fri, 31 Jul 2020 10:46:52 +0800 Shaokun Zhang 
>>  wrote:
>>>
>>> There's a build failure on arm64:
>>>
>>> In file included from ./include/linux/compat.h:17:0,
>>>  from ./arch/arm64/include/asm/stat.h:13,
>>>  from ./include/linux/stat.h:6,
>>>  from ./include/linux/sysfs.h:22,
>>>  from ./include/linux/kobject.h:20,
>>>  from ./include/linux/of.h:17,
>>>  from ./include/linux/irqdomain.h:35,
>>>  from ./include/linux/acpi.h:13,
>>>  from ./include/acpi/apei.h:9,
>>>  from ./include/acpi/ghes.h:5,
>>>  from ./include/linux/arm_sdei.h:8,
>>>  from arch/arm64/kernel/asm-offsets.c:10:
>>> ./include/linux/fs.h: In function ‘vfs_whiteout’:
>>> ./include/linux/fs.h:1709:32: error: ‘S_IFCHR’ undeclared (first use in 
>>> this function)
>>>   return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
>>> ^
>>> ./include/linux/fs.h:1709:32: note: each undeclared identifier is reported 
>>> only once for each
>>> function it appears in
>>> ./include/linux/fs.h: At top level:
>>> ./include/linux/fs.h:1855:46: warning: ‘struct kstat’ declared inside 
>>> parameter list
>>>   int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
>>>   ^
>>> ./include/linux/fs.h:1855:46: warning: its scope is only this definition or 
>>> declaration, which is
>>> probably not what you want
>>> ./include/linux/fs.h: In function ‘__mandatory_lock’:
>>> ./include/linux/fs.h:2325:25: error: ‘S_ISGID’ undeclared (first use in 
>>> this function)
>>>   return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
>>>  ^
>>> ./include/linux/fs.h:2325:35: error: ‘S_IXGRP’ undeclared (first use in 
>>> this function)
>>>   return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
>>>^
>>> ./include/linux/fs.h: In function ‘invalidate_remote_inode’:
>>> ./include/linux/fs.h:2588:6: error: implicit declaration of function 
>>> ‘S_ISREG’
>>> [-Werror=implicit-function-declaration]
>>>   if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
>>>   ^
>>> ./include/linux/fs.h:2588:32: error: implicit declaration of function 
>>> ‘S_ISDIR’
>>> [-Werror=implicit-function-declaration]
>>>   if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
>>> ^
>>> ./include/linux/fs.h:2589:6: error: implicit declaration of function 
>>> ‘S_ISLNK’
>>> [-Werror=implicit-function-declaration]
>>>   S_ISLNK(inode->i_mode))
>>>   ^
>>> ./include/linux/fs.h: In function ‘execute_ok’:
>>> ./include/linux/fs.h:2768:26: error: ‘S_IXUGO’ undeclared (first use in 
>>> this function)
>>>   return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
>>
>> Presumably caused by commit
>>
>>   b902bfb3f0e9 ("arm64: stop using  directly")
> 
> I have reverted this commit
>   b902bfb3f0e9 ("arm64: stop using  directly")
> 
> and rebuilt arm64 failed due to below errors/warnings.
> 
> make -sk KBUILD_BUILD_USER=TuxBuild -C/linux -j16 ARCH=arm64
> CROSS_COMPILE=aarch64-linux-gnu- HOSTCC=gcc CC="sccache
> aarch64-linux-gnu-gcc" O=build Image
> #
> In file included from ../include/linux/stat.h:6,
>  from ../include/linux/sysfs.h:22,
>  from ../include/linux/kobject.h:20,
>  from ../include/linux/of.h:17,
>  from ../include/linux/irqdomain.h:35,
>  from ../include/linux/acpi.h:13,
>  from ../include/acpi/apei.h:9,
>  from ../include/acpi/ghes.h:5,
>  from ../include/linux/arm_sdei.h:8,
>  from ../arch/arm64/kernel/asm-offsets.c:10:
> ../arch/arm64/include/asm/stat.h:20:2: error: unknown type name ‘compat_u64’
>20 |  compat_u64 st_dev;
>   |  

0a3a4497a1de ("compat: lift compat_s64 and compat_u64 to <asm-generic/compat.h>")
has removed the compat_s64 and compat_u64 definitions.

Thanks,
Shaokun

^~
> ../arch/arm64/include/asm/stat.h:31:2: error: unknown type name ‘compat_u64’
>31 |  compat_u64 st_rdev;
>   |  ^~
> ../arch/arm64/include/asm/stat.h:34:2: error: unknown type name ‘compat_s64’
>34 |  compat_s64 st_size;
>   |  ^~
> ../arch/arm64/include/asm/stat.h:36:2: error: unknown type name ‘compat_u64’
>36 |  compat_u64 st_blocks; /* Number of 512-byte blocks allocated. */
>   |  ^~
> ../arch/arm64/include/asm/stat.h:47:2: error: unknown type name ‘compat_u64’
>47 |  compat_u64 st_ino;
>   |  ^~
> make[2]: *** [../scripts/Makefile.build:114:
> arch/arm64/kernel/asm-offsets.s] Error 1
> 
> - Naresh
> 
> .
> 



Re: linux-next: Tree for Jul 30 [build failure on arm64]

2020-07-30 Thread Shaokun Zhang
Hi,

There's a build failure on arm64:

In file included from ./include/linux/compat.h:17:0,
 from ./arch/arm64/include/asm/stat.h:13,
 from ./include/linux/stat.h:6,
 from ./include/linux/sysfs.h:22,
 from ./include/linux/kobject.h:20,
 from ./include/linux/of.h:17,
 from ./include/linux/irqdomain.h:35,
 from ./include/linux/acpi.h:13,
 from ./include/acpi/apei.h:9,
 from ./include/acpi/ghes.h:5,
 from ./include/linux/arm_sdei.h:8,
 from arch/arm64/kernel/asm-offsets.c:10:
./include/linux/fs.h: In function ‘vfs_whiteout’:
./include/linux/fs.h:1709:32: error: ‘S_IFCHR’ undeclared (first use in this 
function)
  return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
^
./include/linux/fs.h:1709:32: note: each undeclared identifier is reported only 
once for each
function it appears in
./include/linux/fs.h: At top level:
./include/linux/fs.h:1855:46: warning: ‘struct kstat’ declared inside parameter 
list
  int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
  ^
./include/linux/fs.h:1855:46: warning: its scope is only this definition or 
declaration, which is
probably not what you want
./include/linux/fs.h: In function ‘__mandatory_lock’:
./include/linux/fs.h:2325:25: error: ‘S_ISGID’ undeclared (first use in this 
function)
  return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
 ^
./include/linux/fs.h:2325:35: error: ‘S_IXGRP’ undeclared (first use in this 
function)
  return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
   ^
./include/linux/fs.h: In function ‘invalidate_remote_inode’:
./include/linux/fs.h:2588:6: error: implicit declaration of function ‘S_ISREG’
[-Werror=implicit-function-declaration]
  if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
  ^
./include/linux/fs.h:2588:32: error: implicit declaration of function ‘S_ISDIR’
[-Werror=implicit-function-declaration]
  if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
^
./include/linux/fs.h:2589:6: error: implicit declaration of function ‘S_ISLNK’
[-Werror=implicit-function-declaration]
  S_ISLNK(inode->i_mode))
  ^
./include/linux/fs.h: In function ‘execute_ok’:
./include/linux/fs.h:2768:26: error: ‘S_IXUGO’ undeclared (first use in this 
function)
  return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);

Thanks,
Shaokun

在 2020/7/30 19:46, Stephen Rothwell 写道:
> Hi all,
> 
> Changes since 20200729:
> 
> My fixes tree contains:
> 
>   dbf24e30ce2e ("device_cgroup: Fix RCU list debugging warning")
> 
> Linus' tree gained a build failure for which I reverted a commit.
> 
> The vfs tree lost its build failure.
> 
> The printk tree lost its build failure.
> 
> The net-next tree lost its build failure.
> 
> The security tree still had its build failure for which I reverted
> 3 commits.
> 
> The iommu tree gained a conflict against the dma-mapping tree.
> 
> The tip tree still had its build failure for which I reverted a
> commit.
> 
> The vhost tree lost its build failure but gained more for which I applied
> a patch.
> 
> The hmm tree gained conflicts against the drm and kvm-ppc trees.
> 
> Non-merge commits (relative to Linus' tree): 11751
>  12254 files changed, 603002 insertions(+), 226737 deletions(-)
> 
> 
> 
> I have created today's linux-next tree at
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> (patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
> are tracking the linux-next tree using git, you should not use "git pull"
> to do so as that will try to merge the new linux-next release with the
> old one.  You should use "git fetch" and checkout or reset to the new
> master.
> 
> You can see which trees have been included by looking in the Next/Trees
> file in the source.  There are also quilt-import.log and merge.log
> files in the Next directory.  Between each merge, the tree was built
> with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
> multi_v7_defconfig for arm and a native build of tools/perf. After
> the final fixups (if any), I do an x86_64 modules_install followed by
> builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
> ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
> and sparc64 defconfig and htmldocs. And finally, a simple boot test
> of the powerpc pseries_le_defconfig kernel in qemu (with and without
> kvm enabled).
> 
> Below is a summary of the state of the merge.
> 
> I am currently merging 328 trees (counting Linus' and 85 trees of bug
> fix patches pending for the current merge release).
> 
> Stats about the size of the tree over time can be seen at
> 

Re: [fs] 936e92b615: unixbench.score 32.3% improvement

2020-07-13 Thread Shaokun Zhang
Hi maintainers,

This issue was debugged on Huawei Kunpeng 920, which is an ARM64 platform, and we
also ran more tests on an x86 platform.
Since Rong has also reported the improvement on x86, it seems necessary for us
to make this change.
Any comments on it?

Thanks,
Shaokun

在 2020/7/8 15:23, kernel test robot 写道:
> Greeting,
> 
> FYI, we noticed a 32.3% improvement of unixbench.score due to commit:
> 
> 
> commit: 936e92b615e212d08eb74951324bef25ba564c34 ("[PATCH RESEND] fs: Move 
> @f_count to different cacheline with @f_mode")
> url: 
> https://github.com/0day-ci/linux/commits/Shaokun-Zhang/fs-Move-f_count-to-different-cacheline-with-f_mode/20200624-163511
> base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 
> 5e857ce6eae7ca21b2055cca4885545e29228fe2
> 
> in testcase: unixbench
> on test machine: 192 threads Intel(R) Xeon(R) Platinum 9242 CPU @ 2.30GHz 
> with 192G memory
> with following parameters:
> 
>   runtime: 300s
>   nr_task: 30%
>   test: syscall
>   cpufreq_governor: performance
>   ucode: 0x5002f01
> 
> test-description: UnixBench is the original BYTE UNIX benchmark suite aims to 
> test performance of Unix-like system.
> test-url: https://github.com/kdlucas/byte-unixbench
> 
> 
> 
> 
> 
> Details are as below:
> -->
> 
> 
> To reproduce:
> 
> git clone https://github.com/intel/lkp-tests.git
> cd lkp-tests
> bin/lkp install job.yaml  # job file is attached in this email
> bin/lkp run job.yaml
> 
> =
> compiler/cpufreq_governor/kconfig/nr_task/rootfs/runtime/tbox_group/test/testcase/ucode:
>   
> gcc-9/performance/x86_64-rhel-7.6/30%/debian-x86_64-20191114.cgz/300s/lkp-csl-2ap3/syscall/unixbench/0x5002f01
> 
> commit: 
>   5e857ce6ea ("Merge branch 'hch' (maccess patches from Christoph Hellwig)")
>   936e92b615 ("fs: Move @f_count to different cacheline with @f_mode")
> 
> 5e857ce6eae7ca21 936e92b615e212d08eb74951324 
>  --- 
>  %stddev %change %stddev
>  \  |\  
>   2297 ±  2% +32.3%   3038unixbench.score
> 171.74   +34.8% 231.55unixbench.time.user_time
>  1.366e+09   +32.6%  1.812e+09unixbench.workload
>  26472 ±  6%   +1270.0% 362665 ±158%  cpuidle.C1.usage
>   0.25 ±  2%  +0.10.33mpstat.cpu.all.usr%
>   8.32 ± 43%+129.7%  19.12 ± 63%  sched_debug.cpu.clock.stddev
>   8.32 ± 43%+129.7%  19.12 ± 63%  
> sched_debug.cpu.clock_task.stddev
>   2100 ±  2% -15.6%   1772 ±  9%  sched_debug.cpu.nr_switches.min
> 373.34 ±  3% +12.4% 419.48 ±  6%  
> sched_debug.cpu.ttwu_local.stddev
>   2740 ± 12% -72.3% 757.75 ±105%  
> numa-vmstat.node0.nr_inactive_anon
>   3139 ±  8% -69.9% 946.25 ± 97%  numa-vmstat.node0.nr_shmem
>   2740 ± 12% -72.3% 757.75 ±105%  
> numa-vmstat.node0.nr_zone_inactive_anon
> 373.75 ± 51%+443.3%   2030 ± 26%  
> numa-vmstat.node2.nr_inactive_anon
> 496.00 ± 19%+366.1%   2311 ± 29%  numa-vmstat.node2.nr_shmem
> 373.75 ± 51%+443.3%   2030 ± 26%  
> numa-vmstat.node2.nr_zone_inactive_anon
>  13728 ± 13%+148.1%  34056 ± 46%  numa-vmstat.node3.nr_active_anon
>  78558   +11.3%  87431 ±  6%  numa-vmstat.node3.nr_file_pages
>   9939 ±  8% +19.7%  11902 ± 13%  numa-vmstat.node3.nr_shmem
>  13728 ± 13%+148.1%  34056 ± 46%  
> numa-vmstat.node3.nr_zone_active_anon
>  11103 ± 13% -71.2%   3201 ± 99%  numa-meminfo.node0.Inactive
>  10962 ± 12% -72.3%   3032 ±105%  
> numa-meminfo.node0.Inactive(anon)
>   8551 ± 31% -29.4%   6034 ± 18%  numa-meminfo.node0.Mapped
>  12560 ±  8% -69.9%   3786 ± 97%  numa-meminfo.node0.Shmem
>   1596 ± 51%+415.6%   8230 ± 24%  numa-meminfo.node2.Inactive
>   1496 ± 51%+442.8%   8122 ± 26%  
> numa-meminfo.node2.Inactive(anon)
>   1984 ± 19%+366.1%   9248 ± 29%  numa-meminfo.node2.Shmem
>  54929 ± 13%+148.0% 136212 ± 46%  numa-meminfo.node3.Active
>  54929 ± 13%+148.0% 136206 ± 46%  numa-meminfo.node3.Active(anon)
> 314216   +11.3% 349697 ±  6%  numa-meminfo.node3.FilePages
> 747907 ±  2% +15.2% 861672 ±  9%  numa-meminfo.node3.MemUsed
>  39744 ±  8% +19.7%  47580 ± 13%  numa-meminfo.node3.Shmem
>  13.9

Re: [Patch v3 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2020-06-29 Thread Shaokun Zhang
Hi Andrew,

在 2020/6/25 3:26, Andrew Morton 写道:
> On Tue, 23 Jun 2020 15:23:29 -0400 Nitesh Narayan Lal  
> wrote:
> 
>> From: Alex Belits 
>>
>> The current implementation of cpumask_local_spread() does not respect the
>> isolated CPUs, i.e., even if a CPU has been isolated for Real-Time task,
>> it will return it to the caller for pinning of its IRQ threads. Having
>> these unwanted IRQ threads on an isolated CPU adds up to a latency
>> overhead.
>>
>> Restrict the CPUs that are returned for spreading IRQs only to the
>> available housekeeping CPUs.
>>
>> ...
>>
>> --- a/lib/cpumask.c
>> +++ b/lib/cpumask.c
>> @@ -6,6 +6,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  /**
>>   * cpumask_next - get the next cpu in a cpumask
>> @@ -205,22 +206,27 @@ void __init free_bootmem_cpumask_var(cpumask_var_t 
>> mask)
>>   */
>>  unsigned int cpumask_local_spread(unsigned int i, int node)
>>  {
>> -int cpu;
>> +int cpu, hk_flags;
>> +const struct cpumask *mask;
>>  
>> +hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
>> +mask = housekeeping_cpumask(hk_flags);
>>  /* Wrap: we always want a cpu. */
>> -i %= num_online_cpus();
>> +i %= cpumask_weight(mask);
>>  
>>  if (node == NUMA_NO_NODE) {
>> -for_each_cpu(cpu, cpu_online_mask)
>> +for_each_cpu(cpu, mask) {
>>  if (i-- == 0)
>>  return cpu;
>> +}
>>  } else {
>>  /* NUMA first. */
>> -for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
>> +for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
>>  if (i-- == 0)
>>  return cpu;
>> +}
>>  
>> -for_each_cpu(cpu, cpu_online_mask) {
>> +for_each_cpu(cpu, mask) {
>>  /* Skip NUMA nodes, done above. */
>>  if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
>>  continue;
> 
> Are you aware of these changes to cpu_local_spread()?
> https://lore.kernel.org/lkml/1582768688-2314-1-git-send-email-zhangshao...@hisilicon.com/
> 
> I don't see a lot of overlap but it would be nice for you folks to

Yeah, it's a different issue from Nitesh's. As for our patch, it has been in
linux-next for a long time; will it be merged into Linus's tree?

Thanks,
Shaokun

> check each other's homework ;)
> 
> 
> 
> .
> 



Re: linux-next: Tree for Jun 24 [build failure on arm64]

2020-06-27 Thread Shaokun Zhang
Hi Will,

My apologies for the late reply; there was a short holiday in China.

在 2020/6/24 18:55, Will Deacon 写道:
> On Wed, Jun 24, 2020 at 05:08:56PM +0800, Shaokun Zhang wrote:
>> +Will Deacon,
>>
>> Hi Will,
>>
>> There's a build failure on arm64:
>>
>>   CALLscripts/atomic/check-atomics.sh
>>   CALLscripts/checksyscalls.sh
>>   LD  arch/arm64/kernel/vdso/vdso.so.dbg
>> ld: unrecognized option '--no-eh-frame-hdr'
>> ld: use the --help option for usage information
>> arch/arm64/kernel/vdso/Makefile:64: recipe for target
>> 'arch/arm64/kernel/vdso/vdso.so.dbg' failed
>> make[1]: *** [arch/arm64/kernel/vdso/vdso.so.dbg] Error 1
>> arch/arm64/Makefile:175: recipe for target 'vdso_prepare' failed
>> make: *** [vdso_prepare] Error 2
>>
>> GCC version is followed:
>> gcc (Ubuntu/Linaro 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
>>
>> It seems caused by
>> 87676cfca141 arm64: vdso: Disable dwarf unwinding through the sigreturn
>> trampoline
> 
> Urgh, binutils quality strikes again. If you're able to reproduce locally,
> can you try the diff below, please? All the linkers I have kicking around

It works; I see Jon has already tested it.
Thanks for your quick reply.

Shaokun

> seem to support --no-eh-frame-hdr.
> 
> Will
> 
> --->8
> 
> diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
> index 1e5a940532da..97d3d3632093 100644
> --- a/arch/arm64/kernel/vdso/Makefile
> +++ b/arch/arm64/kernel/vdso/Makefile
> @@ -23,8 +23,9 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti
>  # potential future proofing if we end up with internal calls to the exported
>  # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so
>  # preparation in build-time C")).
> -ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \
> -   -Bsymbolic --no-eh-frame-hdr --build-id -n $(btildflags-y) -T
> +ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv 
>   \
> +-Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id -n
>   \
> +$(btildflags-y) -T
>  
>  ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
>  ccflags-y += -DDISABLE_BRANCH_PROFILING
> 
> 
> .
> 



Re: linux-next: Tree for Jun 24 [build failure on arm64]

2020-06-24 Thread Shaokun Zhang
+Will Deacon,

Hi Will,

There's a build failure on arm64:

  CALLscripts/atomic/check-atomics.sh
  CALLscripts/checksyscalls.sh
  LD  arch/arm64/kernel/vdso/vdso.so.dbg
ld: unrecognized option '--no-eh-frame-hdr'
ld: use the --help option for usage information
arch/arm64/kernel/vdso/Makefile:64: recipe for target
'arch/arm64/kernel/vdso/vdso.so.dbg' failed
make[1]: *** [arch/arm64/kernel/vdso/vdso.so.dbg] Error 1
arch/arm64/Makefile:175: recipe for target 'vdso_prepare' failed
make: *** [vdso_prepare] Error 2

The GCC version is as follows:
gcc (Ubuntu/Linaro 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609

It seems caused by
87676cfca141 arm64: vdso: Disable dwarf unwinding through the sigreturn
trampoline

Thanks,
Shaokun

在 2020/6/24 14:53, Stephen Rothwell 写道:
> Hi all,
> 
> Changes since 20200623:
> 
> Renamed trees:slave-dma{,-fixes} -> dmaengine{,-fixes}
> 
> My fixes tree contains:
> 
>   466d58f824f1 ("device_cgroup: Fix RCU list debugging warning")
>   9bd7b7c45d71 ("sched: Fix RANDSTRUCT build fail")
>   2f437faecf71 ("powerpc/boot/dts: Fix dtc "pciex" warnings")
> 
> The printk tree lost its build failure.
> 
> The hid tree still had its build failure so I used the version from
> next-20200618.
> 
> The amdgpu tree lost its build failure.
> 
> The tip tree still had one build failure for which I reverted a commit.
> 
> The rcu tree gained a conflict against the tip tree.
> 
> Non-merge commits (relative to Linus' tree): 3015
>  3371 files changed, 258238 insertions(+), 58359 deletions(-)
> 
> 
> 
> I have created today's linux-next tree at
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> (patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
> are tracking the linux-next tree using git, you should not use "git pull"
> to do so as that will try to merge the new linux-next release with the
> old one.  You should use "git fetch" and checkout or reset to the new
> master.
> 
> You can see which trees have been included by looking in the Next/Trees
> file in the source.  There are also quilt-import.log and merge.log
> files in the Next directory.  Between each merge, the tree was built
> with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
> multi_v7_defconfig for arm and a native build of tools/perf. After
> the final fixups (if any), I do an x86_64 modules_install followed by
> builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
> ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
> and sparc64 defconfig and htmldocs. And finally, a simple boot test
> of the powerpc pseries_le_defconfig kernel in qemu (with and without
> kvm enabled).
> 
> Below is a summary of the state of the merge.
> 
> I am currently merging 321 trees (counting Linus' and 82 trees of bug
> fix patches pending for the current merge release).
> 
> Stats about the size of the tree over time can be seen at
> http://neuling.org/linux-next-size.html .
> 
> Status of my local build tests will be at
> http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
> advice about cross compilers/configs that work, we are always open to add
> more builds.
> 
> Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
> Gortmaker for triage and bug fixes.
> 



[PATCH RESEND] fs: Move @f_count to different cacheline with @f_mode

2020-06-24 Thread Shaokun Zhang
get_file_rcu_many, which is called by __fget_files, has used
atomic_try_cmpxchg now and it can reduce the access number of the global
variable to improve the performance of atomic instruction compared with
atomic_cmpxchg. 

__fget_files checks @f_mode against a mask and also performs atomic
operations on @f_count, but both members are on the same cacheline.
When many CPU cores access files, this causes heavy contention on @f_count.
If we move the two members onto different cachelines, it should relieve
the situation.

We have tested this on ARM64 and X86, the result is as follows:
Syscall of unixbench has been run on Huawei Kunpeng920 with this patch:
24 x System Call Overhead  1

System Call Overhead3160841.4 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.03160841.4   2107.2
   
System Benchmarks Index Score (Partial Only) 2107.2

Without this patch:
24 x System Call Overhead  1

System Call Overhead                        2222456.0 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index              BASELINE       RESULT    INDEX
System Call Overhead                          15000.0    2222456.0   1481.6
   
System Benchmarks Index Score (Partial Only) 1481.6

And on Intel 6248 platform with this patch:
40 CPUs in system; running 24 parallel copies of tests

System Call Overhead4288509.1 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.04288509.1   2859.0
   
System Benchmarks Index Score (Partial Only) 2859.0

Without this patch:
40 CPUs in system; running 24 parallel copies of tests

System Call Overhead3666313.0 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.03666313.0   2444.2
   
System Benchmarks Index Score (Partial Only) 2444.2

Cc: Will Deacon 
Cc: Mark Rutland 
Cc: Peter Zijlstra 
Cc: Alexander Viro 
Cc: Boqun Feng 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f881a892ea7..0faeab5622fb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -955,7 +955,6 @@ struct file {
 */
spinlock_t  f_lock;
enum rw_hintf_write_hint;
-   atomic_long_t   f_count;
unsigned intf_flags;
fmode_t f_mode;
struct mutexf_pos_lock;
@@ -979,6 +978,7 @@ struct file {
struct address_space*f_mapping;
errseq_tf_wb_err;
errseq_tf_sb_err; /* for syncfs */
+   atomic_long_t   f_count;
 } __randomize_layout
   __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
 
-- 
2.7.4



Re: [PATCH] drivers/perf: hisi: Add identifier sysfs file

2020-06-23 Thread Shaokun Zhang
Hi John,

Thanks for your further explanation. I'm OK with it, so for this patch:

Acked-by: Shaokun Zhang 

Thanks,
Shaokun

在 2020/6/18 17:18, John Garry 写道:
> On 18/06/2020 02:40, Shaokun Zhang wrote:
>>> }
>>>   +    hha_pmu->identifier = readl(hha_pmu->base + HHA_VERSION);
>> Since we are now refactoring the PMU framework, the PMU version offset
>> is always the same except DDRC PMU and other uncore PMU modules will
>> also use this, how about we do it as the common code:
>>
>> #define HISI_PMU_VERSION_REG   0x1CF0
>> int hisi_uncore_pmu_version(struct hisi_pmu *hisi_pmu)
>> {
>>     return readl(hisi_pmu->base + HISI_PMU_VERSION_REG);
>> }
>> EXPORT_SYMBOL_GPL(hisi_uncore_pmu_version);
> 
> Hi Shaokun,
> 
> Some points to make:
> 
> - It's hardly worth having a separate function to do this 1-line readl()
> call, especially since it not even generic (DDRC is different)
> 
> - We would have to export it (or put in a common header file with static
> inline keywords) - less exports are good
> 
> - with factoring out common code, it's good to reduce total code - this
> change would increase it, AFAICS
> 
> - This is HW specific. The driver is currently layered such that all HW
> specific stuff is in the HW driver (like hisi_uncore_ddrc_pmu.c), and
> not library code (hisi_uncore_pmu.c). I don't see why you want to mix
> that, like you're proposing in the framework revision you proposed
> internally.
> 
> Thanks,
> John
> 
>>
>> hha_pmu->identifier = hisi_uncore_pmu_version(hha_pmu);
>> we can remove the duplicated PMU version register definition in each
>> module.
>>
>> Thanks,
>> Shaokun
>>
>>> +
>>>   return 0;
>>>   }
>>>   @@ -320,10 +323,23 @@ static const struct attribute_group
>>> hisi_hha_pmu_cpumask_attr_group = {
>>>   .attrs = hisi_hha_pmu_cpumask_attrs,
>>>   };
> 
> 
> .



Re: [PATCH] drivers/perf: hisi: Add identifier sysfs file

2020-06-17 Thread Shaokun Zhang
Hi John,

在 2020/6/17 21:05, John Garry 写道:
> To allow userspace to identify the specific implementation of the device,
> add an "identifier" sysfs file.
> 
> Encoding is as follows:
> hi1620: 0x0   (aka hip08)
> hi1630: 0x30
> 
> Signed-off-by: John Garry 
> 
> diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c 
> b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
> index 15713faaa07e..a83d99f2662e 100644
> --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
> +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
> @@ -33,6 +33,7 @@
>  #define DDRC_INT_MASK0x6c8
>  #define DDRC_INT_STATUS  0x6cc
>  #define DDRC_INT_CLEAR   0x6d0
> +#define DDRC_VERSION 0x710
>  
>  /* DDRC has 8-counters */
>  #define DDRC_NR_COUNTERS 0x8
> @@ -267,6 +268,8 @@ static int hisi_ddrc_pmu_init_data(struct platform_device 
> *pdev,
>   return PTR_ERR(ddrc_pmu->base);
>   }
>  
> + ddrc_pmu->identifier = readl(ddrc_pmu->base + DDRC_VERSION);
> +
>   return 0;
>  }
>  
> @@ -308,10 +311,23 @@ static const struct attribute_group 
> hisi_ddrc_pmu_cpumask_attr_group = {
>   .attrs = hisi_ddrc_pmu_cpumask_attrs,
>  };
>  
> +static struct device_attribute hisi_ddrc_pmu_identifier_attr =
> + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL);
> +
> +static struct attribute *hisi_ddrc_pmu_identifier_attrs[] = {
> + _ddrc_pmu_identifier_attr.attr,
> + NULL
> +};
> +
> +static struct attribute_group hisi_ddrc_pmu_identifier_group = {
> + .attrs = hisi_ddrc_pmu_identifier_attrs,
> +};
> +
>  static const struct attribute_group *hisi_ddrc_pmu_attr_groups[] = {
>   _ddrc_pmu_format_group,
>   _ddrc_pmu_events_group,
>   _ddrc_pmu_cpumask_attr_group,
> + _ddrc_pmu_identifier_group,
>   NULL,
>  };
>  
> diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c 
> b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
> index dcc5600788a9..4fdaf1d995be 100644
> --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
> +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
> @@ -23,6 +23,7 @@
>  #define HHA_INT_MASK 0x0804
>  #define HHA_INT_STATUS   0x0808
>  #define HHA_INT_CLEAR0x080C
> +#define HHA_VERSION  0x1cf0
>  #define HHA_PERF_CTRL0x1E00
>  #define HHA_EVENT_CTRL   0x1E04
>  #define HHA_EVENT_TYPE0  0x1E80
> @@ -261,6 +262,8 @@ static int hisi_hha_pmu_init_data(struct platform_device 
> *pdev,
>   return PTR_ERR(hha_pmu->base);
>   }
>  
> + hha_pmu->identifier = readl(hha_pmu->base + HHA_VERSION);

Since we are now refactoring the PMU framework, and the PMU version offset
is always the same except for the DDRC PMU, and other uncore PMU modules will
also use it, how about we do this in common code:

#define HISI_PMU_VERSION_REG   0x1CF0
int hisi_uncore_pmu_version(struct hisi_pmu *hisi_pmu)
{
   return readl(hisi_pmu->base + HISI_PMU_VERSION_REG);
}
EXPORT_SYMBOL_GPL(hisi_uncore_pmu_version);

hha_pmu->identifier = hisi_uncore_pmu_version(hha_pmu);
we can remove the duplicated PMU version register definition in each module.

Thanks,
Shaokun

> +
>   return 0;
>  }
>  
> @@ -320,10 +323,23 @@ static const struct attribute_group 
> hisi_hha_pmu_cpumask_attr_group = {
>   .attrs = hisi_hha_pmu_cpumask_attrs,
>  };
>  
> +static struct device_attribute hisi_hha_pmu_identifier_attr =
> + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL);
> +
> +static struct attribute *hisi_hha_pmu_identifier_attrs[] = {
> + _hha_pmu_identifier_attr.attr,
> + NULL
> +};
> +
> +static struct attribute_group hisi_hha_pmu_identifier_group = {
> + .attrs = hisi_hha_pmu_identifier_attrs,
> +};
> +
>  static const struct attribute_group *hisi_hha_pmu_attr_groups[] = {
>   _hha_pmu_format_group,
>   _hha_pmu_events_group,
>   _hha_pmu_cpumask_attr_group,
> + _hha_pmu_identifier_group,
>   NULL,
>  };
>  
> diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c 
> b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
> index 7719ae4e2c56..0e7477220be1 100644
> --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
> +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
> @@ -25,6 +25,7 @@
>  #define L3C_INT_STATUS   0x0808
>  #define L3C_INT_CLEAR0x080c
>  #define L3C_EVENT_CTRL   0x1c00
> +#define L3C_VERSION  0x1cf0
>  #define L3C_EVENT_TYPE0  0x1d00
>  /*
>   * Each counter is 48-bits and [48:63] are reserved
> @@ -264,6 +265,8 @@ static int hisi_l3c_pmu_init_data(struct platform_device 
> *pdev,
>   return PTR_ERR(l3c_pmu->base);
>   }
>  
> + l3c_pmu->identifier = readl(l3c_pmu->base + L3C_VERSION);
> +
>   return 0;
>  }
>  
> @@ -310,10 +313,23 @@ static const struct attribute_group 
> hisi_l3c_pmu_cpumask_attr_group = {
>   .attrs = hisi_l3c_pmu_cpumask_attrs,
>  };
>  
> +static struct 

[tip: irq/core] platform-msi: Fix typos in comment

2020-05-30 Thread tip-bot2 for Shaokun Zhang
The following commit has been merged into the irq/core branch of tip:

Commit-ID: ae0bb9fda405c881848f7f6e94d912b35f6e31d2
Gitweb:
https://git.kernel.org/tip/ae0bb9fda405c881848f7f6e94d912b35f6e31d2
Author:Shaokun Zhang 
AuthorDate:Mon, 18 May 2020 11:00:59 +08:00
Committer: Marc Zyngier 
CommitterDate: Mon, 18 May 2020 10:28:30 +01:00

platform-msi: Fix typos in comment

Fix up one typo: @nvec -> @nr_irqs.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Marc Zyngier 
Link: 
https://lore.kernel.org/r/1589770859-19340-1-git-send-email-zhangshao...@hisilicon.com
---
 drivers/base/platform-msi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index 8da314b..c4a17e5 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -387,7 +387,7 @@ void platform_msi_domain_free(struct irq_domain *domain, 
unsigned int virq,
  *
  * @domain:The platform-msi domain
  * @virq:  The base irq from which to perform the allocate operation
- * @nvec:  How many interrupts to free from @virq
+ * @nr_irqs:   How many interrupts to free from @virq
  *
  * Return 0 on success, or an error code on failure. Must be called
  * with irq_domain_mutex held (which can only be done as part of a


Re: [PATCH V1 RESEND 1/3] perf/imx_ddr: Add system PMU identifier for userspace

2020-05-27 Thread Shaokun Zhang
Hi,

On 2020/5/27 22:34, John Garry wrote:
>
> I also really dislike this. What's the preferred way to identify the SoC
> from userspace?

 /proc/cpuinfo? ;)
>>>
>>> The *SoC*!
>>>
 For an non-firmware specific case, I'd say soc_device should be. I'd
 guess ACPI systems don't use it and for them it's dmidecode typically.
 The other problem I have with soc_device is it is optional.
>>>
>>
>> Hi Will,
>>
>>> John -- what do you think about using soc_device to expose this information,
>>> with ACPI systems using DMI data instead?
>>
>> Generally I don't think that DMI is reliable, and I saw this as the least 
>> preferred choice. I'm looking at the sysfs DMI info for my dev board, and I 
>> don't even see anything like a SoC identifier.
>>
>> As for the event_source device sysfs identifier file, it would not always 
>> contain effectively the same as the SoC ID.
>>
>> Certain PMUs which I'm interested in plan to have probe-able identification 
>> info available in future.
>>
> 
> BTW, Shaokun now tells me that the HiSi uncore PMU HW have such registers to 
> identify the implementation. I didn't know.
> 

Right, we have this register which shows the PMU version.

Thanks,
Shaokun


> So we could add that identifier file for those PMUs as proof-of-concept, 
> exposing that register.
> 
> As for other PMUs which I'm interested in, again, future versions should have 
> such registers to self-identify.
> 
> So using something derived from the DT compat string would hopefully be the 
> uncommon case.
> 
> Cheers,
> John
> 
> .
> 



[PATCH] platform-msi: Fix typos in comment

2020-05-17 Thread Shaokun Zhang
Fix up one typo: @nvec -> @nr_irqs.

Cc: Marc Zyngier  
Signed-off-by: Shaokun Zhang 
---
 drivers/base/platform-msi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index 8da314b81eab..c4a17e5edf8b 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -387,7 +387,7 @@ void platform_msi_domain_free(struct irq_domain *domain, 
unsigned int virq,
  *
  * @domain:The platform-msi domain
  * @virq:  The base irq from which to perform the allocate operation
- * @nvec:  How many interrupts to free from @virq
+ * @nr_irqs:   How many interrupts to free from @virq
  *
  * Return 0 on success, or an error code on failure. Must be called
  * with irq_domain_mutex held (which can only be done as part of a
-- 
2.7.4



Re: [RFC PATCH] fs: Move @f_count to different cacheline with @f_mode

2020-05-17 Thread Shaokun Zhang
Hi maintainers,

A gentle ping.

Thanks,
Shaokun

On 2020/4/30 11:25, Shaokun Zhang wrote:
> From: Yuqi Jin 
> 
> __fget_files checks @f_mode against a mask and does some atomic
> operations on @f_count, while both members are on the same cacheline.
> When many CPU cores access files, this causes heavy contention on @f_count.
> If we move the two members into different cachelines, it should relieve
> the situation.
> 
> We have tested this on ARM64 and X86, the result is as follows:
> 
> Syscall of unixbench has been run on Huawei Kunpeng920 with this patch:
> 24 x System Call Overhead  1
> 
> System Call Overhead3160841.4 lps   (10.0 s, 1 samples)
> 
> System Benchmarks Partial Index  BASELINE   RESULTINDEX
> System Call Overhead  15000.03160841.4   2107.2
>
> System Benchmarks Index Score (Partial Only) 2107.2
> 
> Without this patch:
> 24 x System Call Overhead  1
> 
> System Call Overhead456.0 lps   (10.0 s, 1 samples)
> 
> System Benchmarks Partial Index  BASELINE   RESULTINDEX
> System Call Overhead  15000.0456.0   1481.6
>
> System Benchmarks Index Score (Partial Only) 1481.6
> 
> And on Intel 6248 platform with this patch:
> 40 CPUs in system; running 24 parallel copies of tests
> 
> System Call Overhead4288509.1 lps   (10.0 s, 1 
> samples)
> 
> System Benchmarks Partial Index  BASELINE   RESULTINDEX
> System Call Overhead  15000.04288509.1   2859.0
>
> System Benchmarks Index Score (Partial Only) 2859.0
> 
> Without this patch:
> 40 CPUs in system; running 24 parallel copies of tests
> 
> System Call Overhead3666313.0 lps   (10.0 s, 1 
> samples)
> 
> System Benchmarks Partial Index  BASELINE   RESULTINDEX
> System Call Overhead  15000.03666313.0   2444.2
>====
> System Benchmarks Index Score (Partial Only) 2444.2
> 
> Cc: Alexander Viro 
> Signed-off-by: Yuqi Jin 
> Signed-off-by: Shaokun Zhang 
> ---
>  include/linux/fs.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 4f6f59b4f22a..90e76283f0fd 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -953,7 +953,6 @@ struct file {
>*/
>   spinlock_t  f_lock;
>   enum rw_hintf_write_hint;
> - atomic_long_t   f_count;
>   unsigned intf_flags;
>   fmode_t f_mode;
>   struct mutexf_pos_lock;
> @@ -976,6 +975,7 @@ struct file {
>  #endif /* #ifdef CONFIG_EPOLL */
>   struct address_space*f_mapping;
>   errseq_tf_wb_err;
> + atomic_long_t   f_count;
>  } __randomize_layout
>__attribute__((aligned(4)));   /* lest something weird decides that 2 
> is OK */
>  
> 



[PATCH] lib/sort: Remove unused pr_fmt

2020-05-16 Thread Shaokun Zhang
No pr_* is called in sort.c, let's remove the unused pr_fmt macro.

Cc: Kostenzer Felix 
Cc: Andrew Morton 
Signed-off-by: Shaokun Zhang 
---
 lib/sort.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index 3ad454411997..99ecab8d6f41 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -10,8 +10,6 @@
  * quicksort's O(n^2) worst case.
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include 
 #include 
 #include 
-- 
2.7.4



[PATCH v2] net: revert "net: get rid of an signed integer overflow in ip_idents_reserve()"

2020-05-15 Thread Shaokun Zhang
From: Yuqi Jin 

Commit adb03115f459 ("net: get rid of an signed integer overflow in 
ip_idents_reserve()")
replaced "atomic_add_return" with atomic_cmpxchg inside the function
"ip_idents_reserve". The reason was to avoid a UBSAN warning.
However, this change has caused a performance degradation. In GCC-8,
-fno-strict-overflow is now mapped to -fwrapv -fwrapv-pointer, and
signed integer overflow is now undefined by default at all
optimization levels[1]. Moreover, the warning was due to a bug in UBSAN
vs -fwrapv/-fno-strict-overflow, so let's revert it safely.

[1] https://gcc.gnu.org/gcc-8/changes.html

Suggested-by: Peter Zijlstra 
Suggested-by: Eric Dumazet 
Cc: "David S. Miller" 
Cc: Alexey Kuznetsov 
Cc: Hideaki YOSHIFUJI 
Cc: Jakub Kicinski 
Cc: Jiri Pirko 
Cc: Arvind Sankar 
Cc: Peter Zijlstra 
Cc: Eric Dumazet 
Cc: Jiong Wang 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
ChangeLog:
* Revise the commit log
* Add some comments. If it's wholly unnecessary, we
can remove it.

Patch v1: 
https://patchwork.ozlabs.org/project/netdev/patch/1579058620-26684-1-git-send-email-zhangshao...@hisilicon.com/

 net/ipv4/route.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 788c69d9bfe0..455871d6b3a0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -491,18 +491,16 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = READ_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
-   u32 new, delta = 0;
+   u32 delta = 0;
 
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
 
-   /* Do not use atomic_add_return() as it makes UBSAN unhappy */
-   do {
-   old = (u32)atomic_read(p_id);
-   new = old + delta + segs;
-   } while (atomic_cmpxchg(p_id, old, new) != old);
-
-   return new - segs;
+   /* If UBSAN reports an error there, please make sure your compiler
+* supports -fno-strict-overflow before reporting it that was a bug
+* in UBSAN, and it has been fixed in GCC-8.
+*/
+   return atomic_add_return(segs + delta, p_id) - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
-- 
2.7.4



[PATCH trivial] bootconfig: Fixup one typo

2020-05-09 Thread Shaokun Zhang
Fix up one typo: CONFIG_BOOTCONFIG -> CONFIG_BOOT_CONFIG

Cc: Jiri Kosina 
Cc: Steven Rostedt 
Cc: Masami Hiramatsu 
Signed-off-by: Shaokun Zhang 
---
 init/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/main.c b/init/main.c
index 1a5da2c2660c..8369ba173ad8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -460,7 +460,7 @@ static void __init setup_boot_config(const char *cmdline)
 
 static int __init warn_bootconfig(char *str)
 {
-   pr_warn("WARNING: 'bootconfig' found on the kernel command line but 
CONFIG_BOOTCONFIG is not set.\n");
+   pr_warn("WARNING: 'bootconfig' found on the kernel command line but 
CONFIG_BOOT_CONFIG is not set.\n");
return 0;
 }
 early_param("bootconfig", warn_bootconfig);
-- 
2.7.4



[tip: perf/core] perf pmu: Fix function name in comment, its get_cpuid_str(), not get_cpustr()

2020-05-08 Thread tip-bot2 for Shaokun Zhang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 454a8be0cff954f18729964317b40ef2c3031364
Gitweb:
https://git.kernel.org/tip/454a8be0cff954f18729964317b40ef2c3031364
Author:Shaokun Zhang 
AuthorDate:Wed, 29 Apr 2020 14:33:12 +08:00
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Thu, 30 Apr 2020 10:48:33 -03:00

perf pmu: Fix function name in comment, its get_cpuid_str(), not get_cpustr()

get_cpuid_str() is used in tools/perf/arch/xxx/util/header.c,
fix the name in comment.

Signed-off-by: Shaokun Zhang 
Cc: Andi Kleen 
Link: 
http://lore.kernel.org/lkml/1588141992-48382-1-git-send-email-zhangshao...@hisilicon.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/pmu-events/pmu-events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/pmu-events/pmu-events.h 
b/tools/perf/pmu-events/pmu-events.h
index 53e76d5..c8f306b 100644
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -26,7 +26,7 @@ struct pmu_event {
  * Map a CPU to its table of PMU events. The CPU is identified by the
  * cpuid field, which is an arch-specific identifier for the CPU.
  * The identifier specified in tools/perf/pmu-events/arch/xxx/mapfile
- * must match the get_cpustr() in tools/perf/arch/xxx/util/header.c)
+ * must match the get_cpuid_str() in tools/perf/arch/xxx/util/header.c)
  *
  * The  cpuid can contain any character other than the comma.
  */


Re: [PATCH] net: optimize cmpxchg in ip_idents_reserve

2020-05-07 Thread Shaokun Zhang
Hi Peter/Eric,

Shall we use atomic_add_return() unconditionally and add some comments?
Or did I miss something?

Thanks,
Shaokun

On 2020/1/20 16:18, Peter Zijlstra wrote:
> On Fri, Jan 17, 2020 at 10:48:19AM -0800, Eric Dumazet wrote:
>>
>>
>> On 1/17/20 10:38 AM, Arvind Sankar wrote:
>>> On Fri, Jan 17, 2020 at 10:16:45AM -0800, Eric Dumazet wrote:
 Wasn't it the case back in 2016 already for linux-4.8 ?

 What will prevent someone to send another report to netdev/lkml ?

  -fno-strict-overflow support is not a prereq for CONFIG_UBSAN.

 Fact that we kept in lib/ubsan.c and lib/test_ubsan.c code for 
 test_ubsan_add_overflow() and test_ubsan_sub_overflow() is disturbing.

>>>
>>> No, it was bumped in 2018 in commit cafa0010cd51 ("Raise the minimum
>>> required gcc version to 4.6"). That raised it from 3.2 -> 4.6.
>>>
>>
>> This seems good to me, for gcc at least.
>>
>> Maybe it is time to enforce -fno-strict-overflow in KBUILD_CFLAGS 
>> instead of making it conditional.
> 
> IIRC there was a bug in UBSAN vs -fwrapv/-fno-strict-overflow that was
> only fixed in gcc-8 or 9 or so.
> 
> So while the -fwrapv/-fno-strict-overflow flag has been correctly
> supported since like forever, UBSAN was buggy until quite recently when
> used in conjunction with that flag.
> 
> .
> 



[RFC PATCH] fs: Move @f_count to different cacheline with @f_mode

2020-04-29 Thread Shaokun Zhang
From: Yuqi Jin 

__fget_files checks @f_mode against a mask and does some atomic
operations on @f_count, while both members are on the same cacheline.
When many CPU cores access files, this causes heavy contention on @f_count.
If we move the two members into different cachelines, it should relieve
the situation.

We have tested this on ARM64 and X86, the result is as follows:

Syscall of unixbench has been run on Huawei Kunpeng920 with this patch:
24 x System Call Overhead  1

System Call Overhead3160841.4 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.03160841.4   2107.2
   
System Benchmarks Index Score (Partial Only) 2107.2

Without this patch:
24 x System Call Overhead  1

System Call Overhead456.0 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.0456.0   1481.6
   
System Benchmarks Index Score (Partial Only) 1481.6

And on Intel 6248 platform with this patch:
40 CPUs in system; running 24 parallel copies of tests

System Call Overhead4288509.1 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.04288509.1   2859.0
   
System Benchmarks Index Score (Partial Only) 2859.0

Without this patch:
40 CPUs in system; running 24 parallel copies of tests

System Call Overhead3666313.0 lps   (10.0 s, 1 samples)

System Benchmarks Partial Index  BASELINE   RESULTINDEX
System Call Overhead  15000.03666313.0   2444.2
   
System Benchmarks Index Score (Partial Only) 2444.2

Cc: Alexander Viro 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f6f59b4f22a..90e76283f0fd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -953,7 +953,6 @@ struct file {
 */
spinlock_t  f_lock;
enum rw_hintf_write_hint;
-   atomic_long_t   f_count;
unsigned intf_flags;
fmode_t f_mode;
struct mutexf_pos_lock;
@@ -976,6 +975,7 @@ struct file {
 #endif /* #ifdef CONFIG_EPOLL */
struct address_space*f_mapping;
errseq_tf_wb_err;
+   atomic_long_t   f_count;
 } __randomize_layout
   __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
 
-- 
2.7.4



[PATCH RESEND] perf tools: Fix function name in comment

2020-04-29 Thread Shaokun Zhang
get_cpuid_str() is used in tools/perf/arch/xxx/util/header.c,
fix the name in comment.

Cc: Arnaldo Carvalho de Melo 
CC: Andi Kleen 
Signed-off-by: Shaokun Zhang 
---
 tools/perf/pmu-events/pmu-events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/pmu-events/pmu-events.h 
b/tools/perf/pmu-events/pmu-events.h
index 53e76d5d5b37..c8f306b572f4 100644
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -26,7 +26,7 @@ struct pmu_event {
  * Map a CPU to its table of PMU events. The CPU is identified by the
  * cpuid field, which is an arch-specific identifier for the CPU.
  * The identifier specified in tools/perf/pmu-events/arch/xxx/mapfile
- * must match the get_cpustr() in tools/perf/arch/xxx/util/header.c)
+ * must match the get_cpuid_str() in tools/perf/arch/xxx/util/header.c)
  *
  * The  cpuid can contain any character other than the comma.
  */
-- 
2.7.4



Re: linux-next: Tree for Oct 23

2019-10-23 Thread Shaokun Zhang
Hi Geert,

On 2019/10/23 14:44, Geert Uytterhoeven wrote:
> On Wed, Oct 23, 2019 at 8:17 AM Shaokun Zhang
>  wrote:
>> +Cc: Mark Salyzyn
>>
>> There is a compiler failure on arm64 platform, as follow:
>> zhangshaokun@ubuntu:~/linux-next$ make -j64
>>   CALLscripts/atomic/check-atomics.sh
>>   CC  arch/arm64/kernel/asm-offsets.s
>> In file included from ./include/linux/sysctl.h:30:0,
>>  from ./include/linux/umh.h:9,
>>  from ./include/linux/kmod.h:9,
>>  from ./include/linux/module.h:13,
>>  from ./include/linux/acpi.h:29,
>>  from ./include/acpi/apei.h:9,
>>  from ./include/acpi/ghes.h:5,
>>  from ./include/linux/arm_sdei.h:8,
>>  from arch/arm64/kernel/asm-offsets.c:10:
>> ./include/uapi/linux/sysctl.h:561:29: error: expected ‘,’ or ‘}’ before 
>> ‘__attribute__’
>>   NET_IPV6_TEMP_PREFERED_LFT __attribute__((deprecated)) = /* NOTYPO */
>>  ^
>> scripts/Makefile.build:99: recipe for target 
>> 'arch/arm64/kernel/asm-offsets.s' failed
>> make[1]: *** [arch/arm64/kernel/asm-offsets.s] Error 1
>> Makefile:1108: recipe for target 'prepare0' failed
>> make: *** [prepare0] Error 2
>>
>> It's the commit <79f0cf35dccb> ("treewide: cleanup: replace prefered with 
>> preferred").
> 
> After receiving a report from kisskb for failures for m68k, looking at
> http://kisskb.ellerman.id.au/kisskb/head/f3c452cfc59c817950b150b51ec2b33409d7640b/
> and doing some testing, it looks like this construct is supported by gcc-7
> and gcc-8, but not by gcc-4.6.3 and gcc-5. Don't know about gcc-6.
> 

GCC version is 5.4.0
zhangshaokun@ubuntu:~/linux-next$ gcc --version
gcc (Ubuntu/Linaro 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Thanks,

> Gr{oetje,eeting}s,
> 
> Geert
> 
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds
> 
> .
> 



Re: linux-next: Tree for Oct 23

2019-10-23 Thread Shaokun Zhang
+Cc: Mark Salyzyn

There is a compiler failure on arm64 platform, as follows:
zhangshaokun@ubuntu:~/linux-next$ make -j64
  CALLscripts/atomic/check-atomics.sh
  CC  arch/arm64/kernel/asm-offsets.s
In file included from ./include/linux/sysctl.h:30:0,
 from ./include/linux/umh.h:9,
 from ./include/linux/kmod.h:9,
 from ./include/linux/module.h:13,
 from ./include/linux/acpi.h:29,
 from ./include/acpi/apei.h:9,
 from ./include/acpi/ghes.h:5,
 from ./include/linux/arm_sdei.h:8,
 from arch/arm64/kernel/asm-offsets.c:10:
./include/uapi/linux/sysctl.h:561:29: error: expected ‘,’ or ‘}’ before 
‘__attribute__’
  NET_IPV6_TEMP_PREFERED_LFT __attribute__((deprecated)) = /* NOTYPO */
 ^
scripts/Makefile.build:99: recipe for target 'arch/arm64/kernel/asm-offsets.s' 
failed
make[1]: *** [arch/arm64/kernel/asm-offsets.s] Error 1
Makefile:1108: recipe for target 'prepare0' failed
make: *** [prepare0] Error 2

It's the commit <79f0cf35dccb> ("treewide: cleanup: replace prefered with 
preferred").

Thanks,
Shaokun


On 2019/10/23 12:55, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20191022:
> 
> Non-merge commits (relative to Linus' tree): 5530
>  5340 files changed, 192671 insertions(+), 90844 deletions(-)
> 
> 
> 
> I have created today's linux-next tree at
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> (patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
> are tracking the linux-next tree using git, you should not use "git pull"
> to do so as that will try to merge the new linux-next release with the
> old one.  You should use "git fetch" and checkout or reset to the new
> master.
> 
> You can see which trees have been included by looking in the Next/Trees
> file in the source.  There are also quilt-import.log and merge.log
> files in the Next directory.  Between each merge, the tree was built
> with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
> multi_v7_defconfig for arm and a native build of tools/perf. After
> the final fixups (if any), I do an x86_64 modules_install followed by
> builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
> ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
> and sparc64 defconfig. And finally, a simple boot test of the powerpc
> pseries_le_defconfig kernel in qemu (with and without kvm enabled).
> 
> Below is a summary of the state of the merge.
> 
> I am currently merging 310 trees (counting Linus' and 78 trees of bug
> fix patches pending for the current merge release).
> 
> Stats about the size of the tree over time can be seen at
> http://neuling.org/linux-next-size.html .
> 
> Status of my local build tests will be at
> http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
> advice about cross compilers/configs that work, we are always open to add
> more builds.
> 
> Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
> Gortmaker for triage and bug fixes.
> 



Re: [RFC] lib: optimize cpumask_local_spread()

2019-10-21 Thread Shaokun Zhang
Hi Michal,

On 2019/10/17 20:37, Michal Hocko wrote:
> On Thu 17-10-19 18:23:08, Shaokun Zhang wrote:
>> From: yuqi jin 
>>
>> In a multi-processor NUMA system, a device may have many NUMA
>> nodes belonging to multiple cpus. When we look up a cpu for the local
>> NUMA node, it is better to return one from the node closest to the local
>> NUMA node instead of going to any online cpu immediately.
>>
>> For example, in the Huawei Kunpeng 920 system, there are 4 NUMA nodes (0 - 3)
>> in the 2-socket system (0 - 1). If the I/O device is in socket 1
>> and the local NUMA node is 2, we should choose the non-local node 3 in
>> the same socket when NUMA node 2 has fewer cpu cores than the I/O requires.
>> If we directly pick one cpu core from all online ones, it may be in
>> another socket, which is not friendly for performance.
> 
> Could you be more specific on the effect of this patch please? Do you
> have any performance numbers?

The NIC driver calls this function to determine the core that each irq will
be bound to, and the initialization of XPS depends on the binding of irqs.
The NIC driver gets the local NUMA node where the device is located.

On the Huawei Kunpeng 920 SoC, there are 4 NUMA nodes with 24 cores per node.
If the function parameter @i = 0-23 and @node = 2, then the returned core is
located on node 2 and the irq will be bound to node 2.
If the parameter @i = 24-47 and @node = 2, without this patch it will return
a core on NUMA node 0; with the patch applied, it will return cpu cores on
NUMA node 3, which is in the same socket.

Without the patch, the performance is 22W QPS; with this patch applied, the
performance improves to 26W QPS.

I'm not sure whether anyone else has hit this problem, so I sent it as an RFC.

> Also is it safe and reasonable to perform GFP_KERNEL (aka sleepable)
> allocations from this function?
> 

Good catch, I missed it and it should be GFP_ATOMIC.

Thanks,
Shaokun.

>> Cc: Andrew Morton 
>> Cc: Mike Rapoport 
>> Cc: Paul Burton 
>> Cc: Michal Hocko 
>> Cc: Michael Ellerman 
>> Cc: Anshuman Khandual 
>> Signed-off-by: yuqi jin 
>> Signed-off-by: Shaokun Zhang 
>> ---
>>  lib/cpumask.c | 78 
>> ++-
>>  1 file changed, 67 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/cpumask.c b/lib/cpumask.c
>> index 0cb672eb107c..8f89c7cebfb0 100644
>> --- a/lib/cpumask.c
>> +++ b/lib/cpumask.c
>> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>  }
>>  #endif
>>  
>> +static void calc_node_distance(int *node_dist, int node)
>> +{
>> +int i;
>> +
>> +for (i = 0; i < nr_node_ids; i++)
>> +node_dist[i] = node_distance(node, i);
>> +}
>> +
>> +static int find_nearest_node(int *node_dist, bool *used_flag)
>> +{
>> +int i, min_dist = node_dist[0], node_id = -1;
>> +
>> +for (i = 0; i < nr_node_ids; i++)
>> +if (used_flag[i] == 0) {
>> +min_dist = node_dist[i];
>> +node_id = i;
>> +break;
>> +}
>> +for (i = 0; i < nr_node_ids; i++)
>> +if (node_dist[i] < min_dist && used_flag[i] == 0) {
>> +min_dist = node_dist[i];
>> +node_id = i;
>> +}
>> +
>> +return node_id;
>> +}
>> +
>>  /**
>>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>>   * @i: index number
>> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>   */
>>  unsigned int cpumask_local_spread(unsigned int i, int node)
>>  {
>> -int cpu;
>> +int cpu, j, id, *node_dist;
>> +bool *used_flag;
>>  
>>  /* Wrap: we always want a cpu. */
>>  i %= num_online_cpus();
>> @@ -215,19 +243,47 @@ unsigned int cpumask_local_spread(unsigned int i, int 
>> node)
>>  if (i-- == 0)
>>  return cpu;
>>  } else {
>> -/* NUMA first. */
>> -for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
>> -if (i-- == 0)
>> -return cpu;
>> +node_dist = kmalloc_array(nr_node_ids,
>> +sizeof(int), GFP_KERNEL);
>> +if (!node_dist)
>> +for_each_cpu(cpu, cpu_online_mask)
>> +if (i-- == 0)
>> +return cpu;
>>  

[PATCH v2] net: stmmac: Fix the problem of tso_xmit

2019-10-20 Thread Shaokun Zhang
From: yuqi jin 

When the address width of DMA is greater than 32, the packet header occupies
a BD descriptor. The starting address of the data should be advanced by the
header length.

Fixes: a993db88d17d ("net: stmmac: Enable support for > 32 Bits addressing in 
XGMAC")
Cc: Eric Dumazet 
Cc: Giuseppe Cavallaro 
Cc: Alexandre Torgue 
Cc: Jose Abreu 
Cc: "David S. Miller" 
Cc: Maxime Coquelin 
Signed-off-by: yuqi jin 
Signed-off-by: Shaokun Zhang 
---
Changes in v2: 
-- Address Eric's comment: add the Fixes tag

 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3dfd04e0506a..4e9c848c67cc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2995,6 +2995,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, 
struct net_device *dev)
} else {
stmmac_set_desc_addr(priv, first, des);
tmp_pay_len = pay_len;
+   des += proto_hdr_len;
}
 
stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
-- 
2.7.4



[PATCH] net: stmmac: Fix the problem of tso_xmit

2019-10-17 Thread Shaokun Zhang
From: yuqi jin 

When the address width of DMA is greater than 32, the packet header occupies
a BD descriptor. The starting address of the data should be advanced by the
header length.

Cc: Giuseppe Cavallaro 
Cc: Alexandre Torgue 
Cc: Jose Abreu 
Cc: "David S. Miller" 
Cc: Maxime Coquelin 
Signed-off-by: yuqi jin 
Signed-off-by: Shaokun Zhang 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c76a1336a451..3e02e64c5fa0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2995,6 +2995,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, 
struct net_device *dev)
} else {
stmmac_set_desc_addr(priv, first, des);
tmp_pay_len = pay_len;
+   des += proto_hdr_len;
}
 
stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
-- 
2.7.4



[RFC] lib: optimize cpumask_local_spread()

2019-10-17 Thread Shaokun Zhang
From: yuqi jin 

In a multi-processor NUMA system, a device may have many NUMA
nodes belonging to multiple cpus. When we look up a cpu for the local
NUMA node, it is better to return one from the node closest to the local
NUMA node instead of going to any online cpu immediately.

For example, in the Huawei Kunpeng 920 system, there are 4 NUMA nodes (0 - 3)
in the 2-socket system (0 - 1). If the I/O device is in socket 1
and the local NUMA node is 2, we should choose the non-local node 3 in
the same socket when NUMA node 2 has fewer cpu cores than the I/O requires.
If we directly pick one cpu core from all online ones, it may be in
another socket, which is not friendly for performance.

Cc: Andrew Morton 
Cc: Mike Rapoport 
Cc: Paul Burton 
Cc: Michal Hocko 
Cc: Michael Ellerman 
Cc: Anshuman Khandual 
Signed-off-by: yuqi jin 
Signed-off-by: Shaokun Zhang 
---
 lib/cpumask.c | 78 ++-
 1 file changed, 67 insertions(+), 11 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..8f89c7cebfb0 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+static void calc_node_distance(int *node_dist, int node)
+{
+   int i;
+
+   for (i = 0; i < nr_node_ids; i++)
+   node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used_flag)
+{
+   int i, min_dist = node_dist[0], node_id = -1;
+
+   for (i = 0; i < nr_node_ids; i++)
+   if (used_flag[i] == 0) {
+   min_dist = node_dist[i];
+   node_id = i;
+   break;
+   }
+   for (i = 0; i < nr_node_ids; i++)
+   if (node_dist[i] < min_dist && used_flag[i] == 0) {
+   min_dist = node_dist[i];
+   node_id = i;
+   }
+
+   return node_id;
+}
+
 /**
  * cpumask_local_spread - select the i'th cpu with local numa cpu's first
  * @i: index number
@@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-   int cpu;
+   int cpu, j, id, *node_dist;
+   bool *used_flag;
 
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
@@ -215,19 +243,47 @@ unsigned int cpumask_local_spread(unsigned int i, int 
node)
if (i-- == 0)
return cpu;
} else {
-   /* NUMA first. */
-   for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
-   if (i-- == 0)
-   return cpu;
+   node_dist = kmalloc_array(nr_node_ids,
+   sizeof(int), GFP_KERNEL);
+   if (!node_dist)
+   for_each_cpu(cpu, cpu_online_mask)
+   if (i-- == 0)
+   return cpu;
 
-   for_each_cpu(cpu, cpu_online_mask) {
-   /* Skip NUMA nodes, done above. */
-   if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
-   continue;
+   used_flag = kmalloc_array(nr_node_ids,
+   sizeof(bool), GFP_KERNEL);
+   if (!used_flag)
+   for_each_cpu(cpu, cpu_online_mask)
+   if (i-- == 0) {
+   kfree(node_dist);
+   return cpu;
+   }
+   memset(used_flag, 0, nr_node_ids * sizeof(bool));
 
-   if (i-- == 0)
-   return cpu;
+   calc_node_distance(node_dist, node);
+   for (j = 0; j < nr_node_ids; j++) {
+   id = find_nearest_node(node_dist, used_flag);
+   if (id < 0)
+   break;
+   for_each_cpu_and(cpu,
+   cpumask_of_node(id), cpu_online_mask)
+   if (i-- == 0) {
+   kfree(node_dist);
+   kfree(used_flag);
+   return cpu;
+   }
+   used_flag[id] = 1;
}
+
+   for_each_cpu(cpu, cpu_online_mask)
+   if (i-- == 0) {
+   kfree(node_dist);
+   kfree(used_flag);
+   return cpu;
+   }
+
+   kfree(node_dist);
+   kfree(used_flag);
}
BUG();
 }
-- 
2.7.4



Re: [PATCH 0/4] HiSilicon hip08 uncore PMU events additions

2019-10-08 Thread Shaokun Zhang
Hi John,

Thanks for your nice work, these are useful for performance profiling
if anyone is unfamiliar with the uncore PMU events on hip08.
For this patchset, please feel free to add
Reviewed-by: Shaokun Zhang 

Thanks,
Shaokun

On 2019/9/4 23:54, John Garry wrote:
> This patchset adds some missing uncore PMU events for the hip08 arm64
> platform.
> 
> The missing events were originally mentioned in
> https://lkml.org/lkml/2019/6/14/645, when upstreaming the JSONs initially.
> 
> It also includes a fix for a DDRC eventname.
> 
> John Garry (4):
>   perf jevents: Fix Hisi hip08 DDRC PMU eventname
>   perf jevents: Add some missing events for Hisi hip08 DDRC PMU
>   perf jevents: Add some missing events for Hisi hip08 L3C PMU
>   perf jevents: Add some missing events for Hisi hip08 HHA PMU
> 
>  .../arm64/hisilicon/hip08/uncore-ddrc.json| 16 +-
>  .../arm64/hisilicon/hip08/uncore-hha.json | 23 +++-
>  .../arm64/hisilicon/hip08/uncore-l3c.json | 56 +++
>  3 files changed, 93 insertions(+), 2 deletions(-)
> 



Re: [PATCH] Revert "net: get rid of an signed integer overflow in ip_idents_reserve()"

2019-09-05 Thread Shaokun Zhang
Hi Eric,

On 2019/7/26 17:58, Eric Dumazet wrote:
> 
> 
> On 7/26/19 11:17 AM, Shaokun Zhang wrote:
>> From: Yang Guo 
>>
>> There is a significant performance regression with the following
>> commit-id 
>> ("net: get rid of an signed integer overflow in ip_idents_reserve()").
>>
>>
> 
> So, you jump around and took ownership of this issue, while some of us
> are already working on it ?
> 

Any update about this issue?

Thanks,
Shaokun

> Have you first checked that current UBSAN versions will not complain anymore ?
> 
> A revert adding back the original issue would be silly, performance of
> benchmarks is nice but secondary.
> 
> 
> 



Re: [ext4] [confidence: ] 2f7f60cf9f: WARNING:at_lib/list_debug.c:#__list_add_valid

2019-08-29 Thread Shaokun Zhang
Hi Oliver,

On 2019/8/30 11:11, kernel test robot wrote:
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 2f7f60cf9fbcd80200edee8c29b9b35681c63c3e ("[PATCH] ext4: change the 
> type of ext4 cache stats to percpu_counter to improve performance")

Thanks for the report.

This patch has been dropped and the updated patch has been sent to community.
https://lkml.org/lkml/2019/8/28/286

Thanks,
Shaokun

> url: 
> https://github.com/0day-ci/linux/commits/Shaokun-Zhang/ext4-change-the-type-of-ext4-cache-stats-to-percpu_counter-to-improve-performance/20190825-123505
> 
> 
> in testcase: ltp
> with following parameters:
> 
>   test: quickhit
> 
> test-description: The LTP testsuite contains a collection of tools for 
> testing the Linux kernel and related features.
> test-url: http://linux-test-project.github.io/
> 
> 
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G
> 
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
> 
> 
> +-+++
> | | e67095fd2f | 
> 2f7f60cf9f |
> +-+++
> | boot_successes  | 25 | 12   
>   |
> | boot_failures   | 0  | 17   
>   |
> | WARNING:at_lib/list_debug.c:#__list_add_valid   | 0  | 17   
>   |
> | RIP:__list_add_valid| 0  | 17   
>   |
> | WARNING:at_lib/list_debug.c:#__list_del_entry_valid | 0  | 3
>   |
> | RIP:__list_del_entry_valid  | 0  | 3
>   |
> +-+++
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot 
> 
> 
> [   62.458944] WARNING: CPU: 1 PID: 2533 at lib/list_debug.c:25 
> __list_add_valid+0x36/0x70
> [   62.460445] Modules linked in: fuse vfat fat btrfs xor zstd_decompress 
> zstd_compress raid6_pq xfs libcrc32c ext4 mbcache jbd2 loop intel_rapl_msr 
> intel_rapl_common crct10dif_pclmul crc32_pclmul crc32c_intel 
> ghash_clmulni_intel sr_mod bochs_drm cdrom drm_vram_helper sg ttm ppdev 
> drm_kms_helper ata_generic pata_acpi syscopyarea sysfillrect sysimgblt 
> snd_pcm fb_sys_fops drm snd_timer snd aesni_intel crypto_simd cryptd 
> glue_helper soundcore pcspkr joydev serio_raw ata_piix libata i2c_piix4 
> floppy parport_pc parport ip_tables
> [   62.468134] CPU: 1 PID: 2533 Comm: fsync01 Not tainted 
> 5.3.0-rc5-00283-g2f7f60cf9fbcd #1
> [   62.469707] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.10.2-1 04/01/2014
> [   62.471293] RIP: 0010:__list_add_valid+0x36/0x70
> [   62.472332] Code: 48 8b 10 4c 39 c2 75 27 48 39 f8 74 39 48 39 fa 74 34 b8 
> 01 00 00 00 c3 48 89 d1 48 c7 c7 28 ce d2 82 48 89 c2 e8 ba 47 c2 ff <0f> 0b 
> 31 c0 c3 48 89 c1 4c 89 c6 48 c7 c7 a0 ce d2 82 e8 a3 47 c2
> [   62.475779] RSP: 0018:b815c0497cc0 EFLAGS: 00010082
> [   62.477028] RAX:  RBX: 9ab02e61c418 RCX: 
> 0006
> [   62.478540] RDX: 0007 RSI: 0086 RDI: 
> 9ab0bfd17770
> [   62.480096] RBP: 9ab02e61c428 R08: 0510 R09: 
> 00aa
> [   62.481707] R10: 0007 R11: 9ab097f6b8b0 R12: 
> 9ab02e61c450
> [   62.483231] R13: 8314ce80 R14: 0202 R15: 
> 
> [   62.484861] FS:  7f8b236e0500() GS:9ab0bfd0() 
> knlGS:
> [   62.486641] CS:  0010 DS:  ES:  CR0: 80050033
> [   62.488103] CR2: 55b3435d0a60 CR3: 00019b8d2000 CR4: 
> 000406e0
> [   62.489798] DR0:  DR1:  DR2: 
> 
> [   62.491343] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [   62.492925] Call Trace:
> [   62.494524]  __percpu_counter_init+0x64/0xa0
> [   62.495780]  ext4_es_register_shrinker+0x53/0x130 [ext4]
> [   62.497235]  ext4_fill_super+0x1cd4/0x3ad0 [ext4]
> [   62.498521]  ? ext4_calculate_overhead+0x4a0/0x4a0 [ext4]
> [   62.499946]  mount_bdev+0x173/0x1b0
> [   62.501120]  legacy_get_tree+0x27/0x40
> [   62.502315]  vfs_get_tree+0x25/0xf0
> [   62.503421]  do_mount+0x691/0x9c0
> [   62.504516]  ? memdup_user+0x4b/0x70
> [   62.505793]  ksys_mount+0x80/0xd0
> [   62.506858]  __x64_sys_mount+0x21/0x30
> [   62.507979]  do_syscall_64+0x5b/0x1f0
> [   62.509194]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   62.

[PATCH v2] ext4: use percpu_counters for extent_status cache hits/misses

2019-08-28 Thread Shaokun Zhang
From: Yang Guo 

@es_stats_cache_hits and @es_stats_cache_misses are accessed frequently in
ext4_es_lookup_extent function, it would influence the ext4 read/write
performance in NUMA system. Let's optimize it using percpu_counter,
it is profitable for the performance.

The test command is as below:
fio -name=randwrite -numjobs=8 -filename=/mnt/test1 -rw=randwrite
-ioengine=libaio -direct=1 -iodepth=64 -sync=0 -norandommap
-group_reporting -runtime=120 -time_based -bs=4k -size=5G

And the result is about 10% better than the initial implementation:
without the patch,IOPS=197k, BW=770MiB/s (808MB/s)(90.3GiB/120002msec)
with the patch,  IOPS=218k, BW=852MiB/s (894MB/s)(99.9GiB/120002msec)

Cc: "Theodore Ts'o" 
Cc: Andreas Dilger 
Cc: Eric Biggers 
Signed-off-by: Yang Guo 
Signed-off-by: Shaokun Zhang 
---
ChangeLog:
Fix the issue that there is no percpu_counter_destroy() for the new
 percpu counters.

 fs/ext4/extents_status.c | 37 -
 fs/ext4/extents_status.h |  4 ++--
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..3c03062a8d6a 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -947,9 +947,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t 
lblk,
es->es_pblk = es1->es_pblk;
if (!ext4_es_is_referenced(es1))
ext4_es_set_referenced(es1);
-   stats->es_stats_cache_hits++;
+   percpu_counter_inc(>es_stats_cache_hits);
} else {
-   stats->es_stats_cache_misses++;
+   percpu_counter_inc(>es_stats_cache_misses);
}
 
read_unlock(_I(inode)->i_es_lock);
@@ -1235,9 +1235,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, 
void *v)
seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
   percpu_counter_sum_positive(_stats->es_stats_all_cnt),
   percpu_counter_sum_positive(_stats->es_stats_shk_cnt));
-   seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-  es_stats->es_stats_cache_hits,
-  es_stats->es_stats_cache_misses);
+   seq_printf(seq, "  %lld/%lld cache hits/misses\n",
+  percpu_counter_sum_positive(_stats->es_stats_cache_hits),
+  
percpu_counter_sum_positive(_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1264,35 +1264,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(>s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
-   sbi->s_es_stats.es_stats_cache_hits = 0;
-   sbi->s_es_stats.es_stats_cache_misses = 0;
+   err = percpu_counter_init(>s_es_stats.es_stats_cache_hits, 0,
+ GFP_KERNEL);
+   if (err)
+   return err;
+   err = percpu_counter_init(>s_es_stats.es_stats_cache_misses, 0,
+ GFP_KERNEL);
+   if (err)
+   goto err1;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(>s_es_stats.es_stats_all_cnt, 0, 
GFP_KERNEL);
if (err)
-   return err;
+   goto err2;
err = percpu_counter_init(>s_es_stats.es_stats_shk_cnt, 0, 
GFP_KERNEL);
if (err)
-   goto err1;
+   goto err3;
 
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(>s_es_shrinker);
if (err)
-   goto err2;
+   goto err4;
 
return 0;
-
-err2:
+err4:
percpu_counter_destroy(>s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
percpu_counter_destroy(>s_es_stats.es_stats_all_cnt);
+err2:
+   percpu_counter_destroy(>s_es_stats.es_stats_cache_misses);
+err1:
+   percpu_counter_destroy(>s_es_stats.es_stats_cache_hits);
return err;
 }
 
 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
+   percpu_counter_destroy(>s_es_stats.es_stats_cache_hits);
+   percpu_counter_destroy(>s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(>s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(>s_es_stats.es_stats_shk_cnt);
unregister_shrinker(>s_es_shrinker);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..e722dd9bd06e 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
 
 struct ext4_es_stats {
unsigned long es_stats_shrunk;
-  

Re: [PATCH] ext4: change the type of ext4 cache stats to percpu_counter to improve performance

2019-08-26 Thread Shaokun Zhang
Hi Theodore,

On 2019/8/26 23:57, Theodore Y. Ts'o wrote:
> On Mon, Aug 26, 2019 at 04:24:20PM +0800, Shaokun Zhang wrote:
>>> The other problem with this patch is that it initializes
>>> es_stats_cache_hits and es_stats_cache_misses too late.  They will
>>> get used when the journal inode is loaded.  This is mostly harmless,
>>
>> I have checked it again: @es_stats_cache_hits and @es_stats_cache_misses
>> have been initialized before the journal inode is loaded. Maybe I missed
>> something else?
> 
> No, sorry, that was my mistake.  I misread things when I was looking
> over your patch last night.
> 
> Please resubmit your patch once you've fixed things up and tested it.
> 

Sure, will do it soon.

> I would recommend that you at least try running your patch using the
> kvm-xfstests's smoke test[1] before submitting them.  It will save you
> and me time.
> 

OK, thanks for your guidance.

Shaokun,

> [1] 
> https://github.com/tytso/xfstests-bld/blob/master/Documentation/kvm-quickstart.md
> 
> Thanks,
> 
>   - Ted
>   
> 
> .
> 



Re: [PATCH] ext4: change the type of ext4 cache stats to percpu_counter to improve performance

2019-08-26 Thread Shaokun Zhang
Hi Ted,

On 2019/8/26 8:47, Theodore Y. Ts'o wrote:
> On Sun, Aug 25, 2019 at 10:28:03AM -0700, Eric Biggers wrote:
>> This patch is causing the following.  Probably because there's no calls to
>> percpu_counter_destroy() for the new counters?
> 
> Yeah, I noticed this from my test runs last night as well.  It looks
> like original patch was never tested with CONFIG_HOTPLUG_CPU.
> 

Sorry, we missed it completely; we shall double check it and
prepare a proper patch carefully.

> The other problem with this patch is that it initializes
> es_stats_cache_hits and es_stats_cache_misses too late.  They will
> get used when the journal inode is loaded.  This is mostly harmless,

I have checked it again: @es_stats_cache_hits and @es_stats_cache_misses
have been initialized before the journal inode is loaded. Maybe I missed
something else?

egrep "ext4_es_register_shrinker|ext4_load_journal" fs/ext4/super.c
4260:   if (ext4_es_register_shrinker(sbi))
4302:   err = ext4_load_journal(sb, es, journal_devnum);

Thanks,
Shaokun

> but it's also wrong.
> 
> I've dropped this patch from the ext4 git tree.
> 
> - Ted
> 
> .
> 



Re: [PATCH] ext4: change the type of ext4 cache stats to percpu_counter to improve performance

2019-08-26 Thread Shaokun Zhang
Hi Eric,

On 2019/8/26 1:28, Eric Biggers wrote:
> On Sat, Aug 24, 2019 at 11:25:24PM -0400, Theodore Y. Ts'o wrote:
>> On Fri, Aug 23, 2019 at 10:47:34AM +0800, Shaokun Zhang wrote:
>>> From: Yang Guo 
>>>
>>> @es_stats_cache_hits and @es_stats_cache_misses are accessed frequently in
>>> ext4_es_lookup_extent function, it would influence the ext4 read/write
>>> performance in NUMA system.
>>> Let's optimize it using percpu_counter, it is profitable for the
>>> performance.
>>>
>>> The test command is as below:
>>> fio -name=randwrite -numjobs=8 -filename=/mnt/test1 -rw=randwrite
>>> -ioengine=libaio -direct=1 -iodepth=64 -sync=0 -norandommap -group_reporting
>>> -runtime=120 -time_based -bs=4k -size=5G
>>>
>>> And the result is about 10% better than the initial implementation:
>>> without the patch,IOPS=197k, BW=770MiB/s (808MB/s)(90.3GiB/120002msec)
>>> with the patch,  IOPS=218k, BW=852MiB/s (894MB/s)(99.9GiB/120002msec)
>>>
>>> Cc: "Theodore Ts'o" 
>>> Cc: Andreas Dilger 
>>> Signed-off-by: Yang Guo 
>>> Signed-off-by: Shaokun Zhang 
>>
>> Applied with some adjustments so it would apply.  I also changed the patch 
>> summary to:
>>
>> ext4: use percpu_counters for extent_status cache hits/misses
>>
>>  - Ted
> 
> This patch is causing the following.  Probably because there's no calls to
> percpu_counter_destroy() for the new counters?
> 

Apologies, we missed it and will fix it soon.

Thanks,
Shaokun

> ==
> BUG: KASAN: use-after-free in __list_del_entry_valid+0x168/0x180 
> lib/list_debug.c:51
> Read of size 8 at addr 888063168fa8 by task umount/611
> 
> CPU: 1 PID: 611 Comm: umount Not tainted 5.3.0-rc4-00015-gcc08b68e62ec #6
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.12.0-20181126_142135-anatol 04/01/2014
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x86/0xca lib/dump_stack.c:113
>  print_address_description+0x6e/0x2e7 mm/kasan/report.c:351
>  __kasan_report.cold+0x1b/0x35 mm/kasan/report.c:482
>  kasan_report+0x12/0x17 mm/kasan/common.c:612
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
>  __list_del_entry_valid+0x168/0x180 lib/list_debug.c:51
>  __list_del_entry include/linux/list.h:131 [inline]
>  list_del include/linux/list.h:139 [inline]
>  percpu_counter_destroy+0x5d/0x230 lib/percpu_counter.c:157
>  ext4_put_super+0x319/0xbb0 fs/ext4/super.c:1010
>  generic_shutdown_super+0x128/0x320 fs/super.c:458
>  kill_block_super+0x97/0xe0 fs/super.c:1310
>  deactivate_locked_super+0x7b/0xd0 fs/super.c:331
>  deactivate_super+0x138/0x150 fs/super.c:362
>  cleanup_mnt+0x298/0x3f0 fs/namespace.c:1102
>  __cleanup_mnt+0xd/0x10 fs/namespace.c:1109
>  task_work_run+0x103/0x180 kernel/task_work.c:113
>  tracehook_notify_resume include/linux/tracehook.h:188 [inline]
>  exit_to_usermode_loop+0x10b/0x130 arch/x86/entry/common.c:163
>  prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
>  syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
>  do_syscall_64+0x343/0x450 arch/x86/entry/common.c:299
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x7f7caed23d77
> Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 
> 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 8
> RSP: 002b:7ffe960e7c98 EFLAGS: 0246 ORIG_RAX: 00a6
> RAX:  RBX: 560c039e1060 RCX: 7f7caed23d77
> RDX: 0001 RSI:  RDI: 560c039e3c90
> RBP: 560c039e3c90 R08: 560c039e2ec0 R09: 0014
> R10: 06b4 R11: 0246 R12: 7f7caf225e64
> R13:  R14: 560c039e1240 R15: 7ffe960e7f20
> 
> Allocated by task 596:
>  save_stack mm/kasan/common.c:69 [inline]
>  set_track mm/kasan/common.c:77 [inline]
>  __kasan_kmalloc.part.0+0x41/0xb0 mm/kasan/common.c:487
>  __kasan_kmalloc.constprop.0+0xba/0xc0 mm/kasan/common.c:468
>  kasan_kmalloc+0x9/0x10 mm/kasan/common.c:501
>  kmem_cache_alloc_trace+0x11e/0x2e0 mm/slab.c:3550
>  kmalloc include/linux/slab.h:552 [inline]
>  kzalloc include/linux/slab.h:748 [inline]
>  ext4_fill_super+0x111/0x80a0 fs/ext4/super.c:3610
>  mount_bdev+0x286/0x350 fs/super.c:1283
>  ext4_mount+0x10/0x20 fs/ext4/super.c:6007
>  legacy_get_tree+0x101/0x1f0 fs/fs_context.c:661
>  vfs_get_tree+0x86/0x2e0 fs/super.c:1413
>  do_new_mount fs/namespace.c:2791 [inline]
>  do_mount+0x1093/0x1b30 fs/namespace.c:3111
>  ksys_mount+0x7d/0xd0 fs/namespace.c:3320
>

[PATCH] ext4: change the type of ext4 cache stats to percpu_counter to improve performance

2019-08-22 Thread Shaokun Zhang
From: Yang Guo 

@es_stats_cache_hits and @es_stats_cache_misses are accessed frequently in
ext4_es_lookup_extent function, it would influence the ext4 read/write
performance in NUMA system.
Let's optimize it using percpu_counter, it is profitable for the
performance.

The test command is as below:
fio -name=randwrite -numjobs=8 -filename=/mnt/test1 -rw=randwrite
-ioengine=libaio -direct=1 -iodepth=64 -sync=0 -norandommap -group_reporting
-runtime=120 -time_based -bs=4k -size=5G

And the result is about 10% better than the initial implementation:
without the patch,IOPS=197k, BW=770MiB/s (808MB/s)(90.3GiB/120002msec)
with the patch,  IOPS=218k, BW=852MiB/s (894MB/s)(99.9GiB/120002msec)

Cc: "Theodore Ts'o" 
Cc: Andreas Dilger 
Signed-off-by: Yang Guo 
Signed-off-by: Shaokun Zhang 
---
 fs/ext4/extents_status.c | 20 +---
 fs/ext4/extents_status.h |  4 ++--
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..7699e80ae236 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -947,9 +947,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t 
lblk,
es->es_pblk = es1->es_pblk;
if (!ext4_es_is_referenced(es1))
ext4_es_set_referenced(es1);
-   stats->es_stats_cache_hits++;
+   percpu_counter_inc(>es_stats_cache_hits);
} else {
-   stats->es_stats_cache_misses++;
+   percpu_counter_inc(>es_stats_cache_misses);
}
 
read_unlock(_I(inode)->i_es_lock);
@@ -1235,9 +1235,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, 
void *v)
seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
   percpu_counter_sum_positive(_stats->es_stats_all_cnt),
   percpu_counter_sum_positive(_stats->es_stats_shk_cnt));
-   seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-  es_stats->es_stats_cache_hits,
-  es_stats->es_stats_cache_misses);
+   seq_printf(seq, "  %llu/%llu cache hits/misses\n",
+  percpu_counter_sum_positive(_stats->es_stats_cache_hits),
+  
percpu_counter_sum_positive(_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1264,8 +1264,14 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(>s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
-   sbi->s_es_stats.es_stats_cache_hits = 0;
-   sbi->s_es_stats.es_stats_cache_misses = 0;
+   err = percpu_counter_init(>s_es_stats.es_stats_cache_hits, 0,
+ GFP_KERNEL);
+   if (err)
+   return err;
+   err = percpu_counter_init(>s_es_stats.es_stats_cache_misses, 0,
+ GFP_KERNEL);
+   if (err)
+   return err;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(>s_es_stats.es_stats_all_cnt, 0, 
GFP_KERNEL);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..e722dd9bd06e 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
 
 struct ext4_es_stats {
unsigned long es_stats_shrunk;
-   unsigned long es_stats_cache_hits;
-   unsigned long es_stats_cache_misses;
+   struct percpu_counter es_stats_cache_hits;
+   struct percpu_counter es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
-- 
2.7.4



[PATCH v2] irqchip/gic-v3-its: Free unused vpt_page when alloc vpe table fail

2019-07-26 Thread Shaokun Zhang
From: Nianyao Tang 

In its_vpe_init, when its_alloc_vpe_table fails, we should free
vpt_page allocated just before, instead of vpe->vpt_page.
Let's fix it.

Cc: Thomas Gleixner 
Cc: Jason Cooper 
Cc: Marc Zyngier 
Signed-off-by: Nianyao Tang 
Signed-off-by: Shaokun Zhang 
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 730fbe0e2a9d..1b5c3672aea2 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3010,7 +3010,7 @@ static int its_vpe_init(struct its_vpe *vpe)
 
if (!its_alloc_vpe_table(vpe_id)) {
its_vpe_id_free(vpe_id);
-   its_free_pending_table(vpe->vpt_page);
+   its_free_pending_table(vpt_page);
return -ENOMEM;
}
 
-- 
2.7.4



[PATCH] Revert "net: get rid of an signed integer overflow in ip_idents_reserve()"

2019-07-26 Thread Shaokun Zhang
From: Yang Guo 

There is a significant performance regression with the following
commit-id 
("net: get rid of an signed integer overflow in ip_idents_reserve()").

Both on x86 server (Skylake) and ARM64 server, when the number of cpu
cores increases, the cpu usage of ip_idents_reserve() is very high
and the performance becomes bad. After reverting the patch, we can
avoid this problem when the number of cpu cores increases.

With the patch on x86, ip_idents_reserve() cpu usage is 63.05% when
iperf3 is run with 32 cpu cores.
Samples: 18K of event 'cycles:ppp', Event count (approx.)
  Children  Self  Command  Shared Object  Symbol
63.18%63.05%  iperf3   [kernel.vmlinux]   [k] ip_idents_reserve

And the IOPS is 4483830pps.
10:46:13 AM IFACE   rxpck/s   txpck/srxkB/stxkB/s
10:46:14 AMlo 4483830.00 4483830.00 192664.57 192664.57

After reverting the patch, ip_idents_reserve() cpu usage is 17.05%.
Samples: 37K of event 'cycles:ppp', 4000 Hz, Event count (approx.)
  Children  Self  Shared Object  Symbol
17.07%17.05%  [kernel]   [k] ip_idents_reserve

And the IOPS is 11600213pps.
05:03:15 PM IFACE   rxpck/s   txpck/srxkB/stxkB/s
05:03:16 PMlo 11600213.00 11600213.00 498446.65 498446.65

The performance regression was also found on ARM64 server and discussed
a few days ago:
https://lore.kernel.org/netdev/98b95fbe-adcc-c95f-7f3d-6c57122f4586
@pengutronix.de/T/#t

Cc: "David S. Miller"  
Cc: Alexey Kuznetsov  
Cc: Hideaki YOSHIFUJI 
Cc: Eric Dumazet 
Cc: Jiri Pirko 
Signed-off-by: Yang Guo 
---
 net/ipv4/route.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 517300d..dff457b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -489,18 +489,12 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = READ_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
-   u32 new, delta = 0;
+   u32 delta = 0;
 
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
 
-   /* Do not use atomic_add_return() as it makes UBSAN unhappy */
-   do {
-   old = (u32)atomic_read(p_id);
-   new = old + delta + segs;
-   } while (atomic_cmpxchg(p_id, old, new) != old);
-
-   return new - segs;
+   return atomic_add_return(segs + delta, p_id) - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
-- 
1.8.3.1



[PATCH] irqchip/gic-v3-its: Free unused vpt_page when alloc vpe table fail

2019-07-25 Thread Shaokun Zhang
From: Nianyao Tang 

In its_vpe_init, when its_alloc_vpe_table fails, we should free
vpt_page allocated just before, instead of vpe->vpt_page.
Let's fix it.

Cc: Thomas Gleixner  
Cc: Jason Cooper 
Cc: Marc Zyngier 
Signed-off-by: Nianyao Tang 
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 730fbe0..1b5c367 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3010,7 +3010,7 @@ static int its_vpe_init(struct its_vpe *vpe)
 
if (!its_alloc_vpe_table(vpe_id)) {
its_vpe_id_free(vpe_id);
-   its_free_pending_table(vpe->vpt_page);
+   its_free_pending_table(vpt_page);
return -ENOMEM;
}
 
-- 
2.7.4



[PATCH v4 1/2] drivers: base: cacheinfo: Add variable to record max cache line size

2019-05-27 Thread Shaokun Zhang
Add coherency_max_size variable to record the maximum cache line size
for different cache levels. If it is available, we will synchronize
it as cache line size, otherwise we will use CTR_EL0.CWG reporting
in cache_line_size() for arm64.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Sudeep Holla 
Cc: Catalin Marinas 
Cc: Jeremy Linton 
Cc: Will Deacon 
Signed-off-by: Shaokun Zhang 
---
ChangeLog since v3:
  -- Address Greg's comments
  -- Fix some commit information

ChangeLog since v2:
  -- Rebase to 5.2-rc2
  -- Export cache_line_size for I/O driver

ChangeLog since v1:
  -- Move coherency_max_size to drivers/base/cacheinfo.c
  -- Address Catalin's comments
  Link: https://www.spinics.net/lists/arm-kernel/msg723615.html

 drivers/base/cacheinfo.c  | 5 +
 include/linux/cacheinfo.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index a7359535caf5..8827c60f51e2 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -213,6 +213,8 @@ int __weak cache_setup_acpi(unsigned int cpu)
return -ENOTSUPP;
 }
 
+unsigned int coherency_max_size;
+
 static int cache_shared_cpu_map_setup(unsigned int cpu)
 {
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -251,6 +253,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
cpumask_set_cpu(i, _leaf->shared_cpu_map);
}
}
+   /* record the maximum cache line size */
+   if (this_leaf->coherency_line_size > coherency_max_size)
+   coherency_max_size = this_leaf->coherency_line_size;
}
 
return 0;
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 70e19bc6cc9f..46b92cd61d0c 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -17,6 +17,8 @@ enum cache_type {
CACHE_TYPE_UNIFIED = BIT(2),
 };
 
+extern unsigned int coherency_max_size;
+
 /**
  * struct cacheinfo - represent a cache leaf node
  * @id: This cache's id. It is unique among caches with the same (type, level).
-- 
2.7.4



[PATCH v4 2/2] arm64: cacheinfo: Update cache_line_size detected from DT or PPTT

2019-05-27 Thread Shaokun Zhang
cache_line_size is derived from CTR_EL0.CWG field and is called mostly
for I/O device drivers. For certain HiSilicon platforms, like the
Kunpeng920 server SoC, cache line sizes are different between L1/2
cache and L3 cache while L1 cache line size is 64-byte and L3 is 128-byte,
but CTR_EL0.CWG is misreporting using L1 cache line size.

We shall report the correct value, which is important for I/O performance.
Let's update the cache line size if it is detected from DT or PPTT
information.

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Sudeep Holla 
Cc: Jeremy Linton 
Cc: Zhenfa Qiu 
Reported-by: Zhenfa Qiu 
Suggested-by: Catalin Marinas 
Signed-off-by: Shaokun Zhang 
---
 arch/arm64/include/asm/cache.h |  6 +-
 arch/arm64/kernel/cacheinfo.c  | 11 +++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 926434f413fa..758af6340314 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -91,11 +91,7 @@ static inline u32 cache_type_cwg(void)
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
-static inline int cache_line_size(void)
-{
-   u32 cwg = cache_type_cwg();
-   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
-}
+int cache_line_size(void);
 
 /*
  * Read the effective value of CTR_EL0.
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 0bf0a835122f..0c0cd4d26b87 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -28,6 +28,17 @@
 #define CLIDR_CTYPE(clidr, level)  \
(((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))
 
+int cache_line_size(void)
+{
+   u32 cwg = cache_type_cwg();
+
+   if (coherency_max_size != 0)
+   return coherency_max_size;
+
+   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
+}
+EXPORT_SYMBOL_GPL(cache_line_size);
+
 static inline enum cache_type get_cache_type(int level)
 {
u64 clidr;
-- 
2.7.4



[PATCH v3 1/2] drivers: base: cacheinfo: Add variable to record max cache line size

2019-05-26 Thread Shaokun Zhang
Add coherency_max_size variable to record the maximum cache line size
for different cache levels. We will synchronize it with CTR_EL0.CWG
reporting in cache_line_size() for arm64.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Sudeep Holla 
Cc: Catalin Marinas 
Cc: Jeremy Linton 
Cc: Will Deacon 
Signed-off-by: Shaokun Zhang 
---
ChangeLog since v2:
  -- Rebase to 5.2-rc2
  -- Export cache_line_size for I/O driver
ChangeLog since v1:
  -- Move coherency_max_size to drivers/base/cacheinfo.c
  -- Address Catalin's comments
  Link: https://www.spinics.net/lists/arm-kernel/msg723615.html

 drivers/base/cacheinfo.c  | 5 +
 include/linux/cacheinfo.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index a7359535caf5..8827c60f51e2 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -213,6 +213,8 @@ int __weak cache_setup_acpi(unsigned int cpu)
return -ENOTSUPP;
 }
 
+unsigned int coherency_max_size;
+
 static int cache_shared_cpu_map_setup(unsigned int cpu)
 {
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -251,6 +253,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
cpumask_set_cpu(i, _leaf->shared_cpu_map);
}
}
+   /* record the maximum cache line size */
+   if (this_leaf->coherency_line_size > coherency_max_size)
+   coherency_max_size = this_leaf->coherency_line_size;
}
 
return 0;
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 70e19bc6cc9f..46b92cd61d0c 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -17,6 +17,8 @@ enum cache_type {
CACHE_TYPE_UNIFIED = BIT(2),
 };
 
+extern unsigned int coherency_max_size;
+
 /**
  * struct cacheinfo - represent a cache leaf node
  * @id: This cache's id. It is unique among caches with the same (type, level).
-- 
2.7.4



[PATCH v3 2/2] arm64: cacheinfo: Update cache_line_size detected from DT or PPTT

2019-05-26 Thread Shaokun Zhang
cache_line_size is derived from CTR_EL0.CWG field and is called mostly
for I/O device drivers. For certain HiSilicon platforms, like the
Kunpeng920 server SoC, cache line sizes are different between L1/2
cache and L3 cache while L1 cache line size is 64-byte and L3 is 128-byte,
but CTR_EL0.CWG is misreporting using L1 cache line size.

We should report the correct value, which is important for I/O performance.
Let's update the cache line size if it is detected from DT or PPTT
information.

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Sudeep Holla 
Cc: Jeremy Linton 
Cc: Zhenfa Qiu 
Reported-by: Zhenfa Qiu 
Suggested-by: Catalin Marinas 
Signed-off-by: Shaokun Zhang 
---
 arch/arm64/include/asm/cache.h |  6 +-
 arch/arm64/kernel/cacheinfo.c  | 11 +++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 926434f413fa..758af6340314 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -91,11 +91,7 @@ static inline u32 cache_type_cwg(void)
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
-static inline int cache_line_size(void)
-{
-   u32 cwg = cache_type_cwg();
-   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
-}
+int cache_line_size(void);
 
 /*
  * Read the effective value of CTR_EL0.
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 0bf0a835122f..3d54b0024246 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -28,6 +28,17 @@
 #define CLIDR_CTYPE(clidr, level)  \
(((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))
 
+int cache_line_size(void)
+{
+   u32 cwg = cache_type_cwg();
+
+   if (coherency_max_size != 0)
+   return coherency_max_size;
+
+   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
+}
+EXPORT_SYMBOL(cache_line_size);
+
 static inline enum cache_type get_cache_type(int level)
 {
u64 clidr;
-- 
2.7.4



[PATCH v2] intel_th: msu: Fix unused variable warning on arm64 platform

2019-05-20 Thread Shaokun Zhang
drivers/hwtracing/intel_th/msu.c: In function ‘msc_buffer_win_alloc’:
drivers/hwtracing/intel_th/msu.c:783:21: warning: unused variable ‘i’ 
[-Wunused-variable]
  int ret = -ENOMEM, i;
 ^
drivers/hwtracing/intel_th/msu.c: In function ‘msc_buffer_win_free’:
drivers/hwtracing/intel_th/msu.c:863:6: warning: unused variable ‘i’ 
[-Wunused-variable]
  int i;
  ^
Fix this compiler warning on arm64 platform.

Cc: Alexander Shishkin 
Cc: Greg Kroah-Hartman 
Suggested-by: Alexander Shishkin 
Signed-off-by: Shaokun Zhang 
---
 drivers/hwtracing/intel_th/msu.c | 40 +++-
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index 81bb54fa3ce8..49e64ca760e6 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -767,6 +767,30 @@ static int __msc_buffer_win_alloc(struct msc_window *win,
return -ENOMEM;
 }
 
+#ifdef CONFIG_X86
+static void msc_buffer_set_uc(struct msc_window *win, unsigned int nr_blocks)
+{
+   int i;
+
+   for (i = 0; i < nr_blocks; i++)
+   /* Set the page as uncached */
+   set_memory_uc((unsigned long)msc_win_block(win, i), 1);
+}
+
+static void msc_buffer_set_wb(struct msc_window *win)
+{
+   int i;
+
+   for (i = 0; i < win->nr_blocks; i++)
+   /* Reset the page to write-back */
+   set_memory_wb((unsigned long)msc_win_block(win, i), 1);
+}
+#else /* !X86 */
+static inline void msc_buffer_set_uc(struct msc_window *win,
+unsigned int nr_blocks) {}
+static inline void msc_buffer_set_wb(struct msc_window *win) {}
+#endif /* CONFIG_X86 */
+
 /**
  * msc_buffer_win_alloc() - alloc a window for a multiblock mode
  * @msc:   MSC device
@@ -780,7 +804,7 @@ static int __msc_buffer_win_alloc(struct msc_window *win,
 static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks)
 {
struct msc_window *win;
-   int ret = -ENOMEM, i;
+   int ret = -ENOMEM;
 
if (!nr_blocks)
return 0;
@@ -811,11 +835,7 @@ static int msc_buffer_win_alloc(struct msc *msc, unsigned 
int nr_blocks)
if (ret < 0)
goto err_nomem;
 
-#ifdef CONFIG_X86
-   for (i = 0; i < ret; i++)
-   /* Set the page as uncached */
-   set_memory_uc((unsigned long)msc_win_block(win, i), 1);
-#endif
+   msc_buffer_set_uc(win, ret);
 
win->nr_blocks = ret;
 
@@ -860,8 +880,6 @@ static void __msc_buffer_win_free(struct msc *msc, struct 
msc_window *win)
  */
 static void msc_buffer_win_free(struct msc *msc, struct msc_window *win)
 {
-   int i;
-
msc->nr_pages -= win->nr_blocks;
 
list_del(&win->entry);
@@ -870,11 +888,7 @@ static void msc_buffer_win_free(struct msc *msc, struct 
msc_window *win)
msc->base_addr = 0;
}
 
-#ifdef CONFIG_X86
-   for (i = 0; i < win->nr_blocks; i++)
-   /* Reset the page to write-back */
-   set_memory_wb((unsigned long)msc_win_block(win, i), 1);
-#endif
+   msc_buffer_set_wb(win);
 
__msc_buffer_win_free(msc, win);
 
-- 
2.7.4



[RESEND PATCH] intel_th: msu: Fix unused variable warning on arm64 platform

2019-05-20 Thread Shaokun Zhang
drivers/hwtracing/intel_th/msu.c: In function ‘msc_buffer_win_alloc’:
drivers/hwtracing/intel_th/msu.c:783:21: warning: unused variable ‘i’ 
[-Wunused-variable]
  int ret = -ENOMEM, i;
 ^
drivers/hwtracing/intel_th/msu.c: In function ‘msc_buffer_win_free’:
drivers/hwtracing/intel_th/msu.c:863:6: warning: unused variable ‘i’ 
[-Wunused-variable]
  int i;
  ^
Fix this compiler warning on arm64 platform.

Cc: Alexander Shishkin 
Cc: Greg Kroah-Hartman 
Signed-off-by: Shaokun Zhang 
---
 drivers/hwtracing/intel_th/msu.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index 81bb54fa3ce8..833a5a8f13ad 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -780,7 +780,10 @@ static int __msc_buffer_win_alloc(struct msc_window *win,
 static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks)
 {
struct msc_window *win;
-   int ret = -ENOMEM, i;
+   int ret = -ENOMEM;
+#ifdef CONFIG_X86
+   int i;
+#endif
 
if (!nr_blocks)
return 0;
@@ -860,7 +863,9 @@ static void __msc_buffer_win_free(struct msc *msc, struct 
msc_window *win)
  */
 static void msc_buffer_win_free(struct msc *msc, struct msc_window *win)
 {
+#ifdef CONFIG_X86
int i;
+#endif
 
msc->nr_pages -= win->nr_blocks;
 
-- 
2.7.4



[PATCH] intel_th: msu: Fix unused variable warning on arm64 platform

2019-05-19 Thread Shaokun Zhang
Fix this compiler warning on arm64 platform.

Cc: Alexander Shishkin 
Cc: Greg Kroah-Hartman 
Signed-off-by: Shaokun Zhang 
---
 drivers/hwtracing/intel_th/msu.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index 81bb54fa3ce8..e15ed5c308e1 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -780,7 +780,7 @@ static int __msc_buffer_win_alloc(struct msc_window *win,
 static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks)
 {
struct msc_window *win;
-   int ret = -ENOMEM, i;
+   int ret = -ENOMEM;
 
if (!nr_blocks)
return 0;
@@ -812,7 +812,7 @@ static int msc_buffer_win_alloc(struct msc *msc, unsigned 
int nr_blocks)
goto err_nomem;
 
 #ifdef CONFIG_X86
-   for (i = 0; i < ret; i++)
+   for (int i = 0; i < ret; i++)
/* Set the page as uncached */
set_memory_uc((unsigned long)msc_win_block(win, i), 1);
 #endif
@@ -860,8 +860,6 @@ static void __msc_buffer_win_free(struct msc *msc, struct 
msc_window *win)
  */
 static void msc_buffer_win_free(struct msc *msc, struct msc_window *win)
 {
-   int i;
-
msc->nr_pages -= win->nr_blocks;
 
list_del(&win->entry);
@@ -871,7 +869,7 @@ static void msc_buffer_win_free(struct msc *msc, struct 
msc_window *win)
}
 
 #ifdef CONFIG_X86
-   for (i = 0; i < win->nr_blocks; i++)
+   for (int i = 0; i < win->nr_blocks; i++)
/* Reset the page to write-back */
set_memory_wb((unsigned long)msc_win_block(win, i), 1);
 #endif
-- 
2.7.4



[PATCH v2 2/2] arm64: cacheinfo: Update cache_line_size detected from DT or PPTT

2019-05-06 Thread Shaokun Zhang
cache_line_size is derived from CTR_EL0.CWG field and is called mostly
for I/O device drivers. For certain HiSilicon platforms, like the
Kunpeng920 server SoC, cache line sizes are different between L1/2
cache and L3 cache while L1 cache line size is 64-byte and L3 is 128-byte,
but CTR_EL0.CWG is misreporting using L1 cache line size.

We should report the correct value, which is important for I/O performance.
Let's update the cache line size if it is detected from DT or PPTT
information.

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Sudeep Holla 
Cc: Jeremy Linton 
Reported-by: Zhenfa Qiu 
Suggested-by: Catalin Marinas 
Signed-off-by: Shaokun Zhang 
---
 arch/arm64/include/asm/cache.h |  6 +-
 arch/arm64/kernel/cacheinfo.c  | 10 ++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 926434f413fa..758af6340314 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -91,11 +91,7 @@ static inline u32 cache_type_cwg(void)
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
-static inline int cache_line_size(void)
-{
-   u32 cwg = cache_type_cwg();
-   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
-}
+int cache_line_size(void);
 
 /*
  * Read the effective value of CTR_EL0.
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 0bf0a835122f..6ffe908d476c 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -28,6 +28,16 @@
 #define CLIDR_CTYPE(clidr, level)  \
(((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))
 
+int cache_line_size(void)
+{
+   u32 cwg = cache_type_cwg();
+
+   if (coherency_max_size != 0)
+   return coherency_max_size;
+
+   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
+}
+
 static inline enum cache_type get_cache_type(int level)
 {
u64 clidr;
-- 
2.7.4



[PATCH v2 1/2] drivers: base: cacheinfo: Add variable to record max cache line size

2019-05-06 Thread Shaokun Zhang
Add coherency_max_size variable to record the maximum cache line size
for different cache levels. We will synchronize it with CTR_EL0.CWG
reporting in cache_line_size() for arm64.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Sudeep Holla 
Cc: Catalin Marinas 
Cc: Jeremy Linton 
Cc: Will Deacon 
Signed-off-by: Shaokun Zhang 
---
ChangeLog since v1
  -- Move coherency_max_size to drivers/base/cacheinfo.c
  -- Address Catalin's comments
  Link: https://www.spinics.net/lists/arm-kernel/msg723615.html

 drivers/base/cacheinfo.c  | 5 +
 include/linux/cacheinfo.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index a7359535caf5..8827c60f51e2 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -213,6 +213,8 @@ int __weak cache_setup_acpi(unsigned int cpu)
return -ENOTSUPP;
 }
 
+unsigned int coherency_max_size;
+
 static int cache_shared_cpu_map_setup(unsigned int cpu)
 {
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -251,6 +253,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
}
}
+   /* record the maximum cache line size */
+   if (this_leaf->coherency_line_size > coherency_max_size)
+   coherency_max_size = this_leaf->coherency_line_size;
}
 
return 0;
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 70e19bc6cc9f..46b92cd61d0c 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -17,6 +17,8 @@ enum cache_type {
CACHE_TYPE_UNIFIED = BIT(2),
 };
 
+extern unsigned int coherency_max_size;
+
 /**
  * struct cacheinfo - represent a cache leaf node
  * @id: This cache's id. It is unique among caches with the same (type, level).
-- 
2.7.4



[tip:perf/core] perf/headers: Fix stale comment for struct perf_addr_filter

2019-04-03 Thread tip-bot for Shaokun Zhang
Commit-ID:  1279e41d535e28cc3b56fa4a09e71a709641cae6
Gitweb: https://git.kernel.org/tip/1279e41d535e28cc3b56fa4a09e71a709641cae6
Author: Shaokun Zhang 
AuthorDate: Wed, 3 Apr 2019 14:54:24 +0800
Committer:  Ingo Molnar 
CommitDate: Wed, 3 Apr 2019 11:40:02 +0200

perf/headers: Fix stale comment for struct perf_addr_filter

The @inode field has been removed after:

  9511bce9fe8e ("perf/core: Fix bad use of igrab()")

Update the description.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Link: 
https://lkml.kernel.org/r/1554274464-5739-1-git-send-email-zhangshao...@hisilicon.com
Signed-off-by: Ingo Molnar 
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ef764f613..085a95e2582a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,7 +464,7 @@ enum perf_addr_filter_action_t {
 /**
  * struct perf_addr_filter - address range filter definition
  * @entry: event's filter list linkage
- * @inode: object file's inode for file-based filters
+ * @path:  object file's path for file-based filters
  * @offset:filter range offset
  * @size:  filter range size (size==0 means single address trigger)
  * @action:filter/start/stop


[PATCH] perf: Fix stale comment for struct perf_addr_filter

2019-04-03 Thread Shaokun Zhang
inode has been removed after commit 9511bce9fe8e
("perf/core: Fix bad use of igrab()"), Let's fix the stale comment. 

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Song Liu 
Cc: Arnaldo Carvalho de Melo  
Signed-off-by: Shaokun Zhang 
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ef764f613..085a95e2582a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,7 +464,7 @@ enum perf_addr_filter_action_t {
 /**
  * struct perf_addr_filter - address range filter definition
  * @entry: event's filter list linkage
- * @inode: object file's inode for file-based filters
+ * @path:  object file's path for file-based filters
  * @offset:filter range offset
  * @size:  filter range size (size==0 means single address trigger)
  * @action:filter/start/stop
-- 
2.7.4



[PATCH -next] net: dsa: mv88e6xxx: Fix build warning when CONFIG_NET_DSA_LEGACY is n

2019-03-04 Thread Shaokun Zhang
When CONFIG_NET_DSA_LEGACY is n, there is a GCC build warning:
drivers/net/dsa/mv88e6xxx/chip.c:4623:13: warning: 
‘mv88e6xxx_ports_cmode_init’ defined but not used [-Wunused-function]
static void mv88e6xxx_ports_cmode_init(struct mv88e6xxx_chip *chip)
Let's fix it.

Fixes: ed8fe20205ac ("net: dsa: mv88e6xxx: prevent interrupt storm caused by 
mv88e6390x_port_set_cmode")
Cc: Heiner Kallweit 
Cc: Andrew Lunn 
Cc: Vivien Didelot  
Cc: Florian Fainelli  
Cc: "David S. Miller" 
Signed-off-by: Shaokun Zhang 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index e4ad16b2dc38..168d4898c36f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4620,14 +4620,6 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip 
*chip,
return 0;
 }
 
-static void mv88e6xxx_ports_cmode_init(struct mv88e6xxx_chip *chip)
-{
-   int i;
-
-   for (i = 0; i < mv88e6xxx_num_ports(chip); i++)
-   chip->ports[i].cmode = MV88E6XXX_PORT_STS_CMODE_INVALID;
-}
-
 static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds,
int port)
 {
@@ -4637,6 +4629,14 @@ static enum dsa_tag_protocol 
mv88e6xxx_get_tag_protocol(struct dsa_switch *ds,
 }
 
 #if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
+static void mv88e6xxx_ports_cmode_init(struct mv88e6xxx_chip *chip)
+{
+   int i;
+
+   for (i = 0; i < mv88e6xxx_num_ports(chip); i++)
+   chip->ports[i].cmode = MV88E6XXX_PORT_STS_CMODE_INVALID;
+}
+
 static const char *mv88e6xxx_drv_probe(struct device *dsa_dev,
   struct device *host_dev, int sw_addr,
   void **priv)
-- 
2.7.4



[PATCH] perf tools: Fix function name in comment

2019-02-28 Thread Shaokun Zhang
get_cpuid_str() is used in tools/perf/arch/xxx/util/header.c,
fix the name in comment.

Cc: Arnaldo Carvalho de Melo 
CC: Andi Kleen 
Signed-off-by: Shaokun Zhang 
---
 tools/perf/pmu-events/pmu-events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/pmu-events/pmu-events.h 
b/tools/perf/pmu-events/pmu-events.h
index 92a4d15ee0b9..9889e4701399 100644
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -24,7 +24,7 @@ struct pmu_event {
  * Map a CPU to its table of PMU events. The CPU is identified by the
  * cpuid field, which is an arch-specific identifier for the CPU.
  * The identifier specified in tools/perf/pmu-events/arch/xxx/mapfile
- * must match the get_cpustr() in tools/perf/arch/xxx/util/header.c)
+ * must match the get_cpuid_str() in tools/perf/arch/xxx/util/header.c)
  *
  * The  cpuid can contain any character other than the comma.
  */
-- 
2.7.4



[tip:x86/cleanups] x86/smpboot: Remove unused phys_id variable

2019-02-18 Thread tip-bot for Shaokun Zhang
Commit-ID:  f91fecc09e498529230b4d5053cb361619a0c42d
Gitweb: https://git.kernel.org/tip/f91fecc09e498529230b4d5053cb361619a0c42d
Author: Shaokun Zhang 
AuthorDate: Mon, 18 Feb 2019 21:05:01 +0800
Committer:  Borislav Petkov 
CommitDate: Mon, 18 Feb 2019 17:09:24 +0100

x86/smpboot: Remove unused phys_id variable

The 'phys_id' local variable became unused after commit

  ce4b1b16502b ("x86/smpboot: Initialize secondary CPU only if master CPU will 
wait for it").

Remove it.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Borislav Petkov 
Cc: Alison Schofield 
Cc: "H. Peter Anvin" 
Cc: Igor Mammedov 
Cc: Ingo Molnar 
Cc: Konrad Rzeszutek Wilk 
Cc: Mike Rapoport 
Cc: Pu Wen 
Cc: Suravee Suthikulpanit 
Cc: Thomas Gleixner 
Cc: x86-ml 
Cc: Yazen Ghannam 
Cc: Zhenzhong Duan 
Link: 
https://lkml.kernel.org/r/1550495101-41755-1-git-send-email-zhangshao...@hisilicon.com
---
 arch/x86/kernel/smpboot.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e557..5d5421b48e55 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -149,7 +149,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
  */
 static void smp_callin(void)
 {
-   int cpuid, phys_id;
+   int cpuid;
 
/*
 * If waken up by an INIT in an 82489DX configuration
@@ -159,11 +159,6 @@ static void smp_callin(void)
 */
cpuid = smp_processor_id();
 
-   /*
-* (This works even if the APIC is not enabled.)
-*/
-   phys_id = read_apic_id();
-
/*
 * the boot CPU has finished the init stage and is spinning
 * on callin_map until we finish. We are free to set up this


[PATCH] x86/smpboot: Remove unused phys_id variable

2019-02-18 Thread Shaokun Zhang
The 'phys_id' local variable became unused after commit ce4b1b16502b
("x86/smpboot: Initialize secondary CPU only if master CPU will wait for it").
Remove it.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Igor Mammedov 
Signed-off-by: Shaokun Zhang 
---
 arch/x86/kernel/smpboot.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e557..5d5421b48e55 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -149,7 +149,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
  */
 static void smp_callin(void)
 {
-   int cpuid, phys_id;
+   int cpuid;
 
/*
 * If waken up by an INIT in an 82489DX configuration
@@ -160,11 +160,6 @@ static void smp_callin(void)
cpuid = smp_processor_id();
 
/*
-* (This works even if the APIC is not enabled.)
-*/
-   phys_id = read_apic_id();
-
-   /*
 * the boot CPU has finished the init stage and is spinning
 * on callin_map until we finish. We are free to set up this
 * CPU, first the APIC. (this is probably redundant on most
-- 
2.7.4



[tip:x86/cleanups] x86/mm/dump_pagetables: Remove the unused prev_pud variable

2019-02-14 Thread tip-bot for Shaokun Zhang
Commit-ID:  8e8a3cea7ea5f5458fdf2287713626892e7715f5
Gitweb: https://git.kernel.org/tip/8e8a3cea7ea5f5458fdf2287713626892e7715f5
Author: Shaokun Zhang 
AuthorDate: Thu, 14 Feb 2019 17:33:49 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 14 Feb 2019 17:09:43 +0100

x86/mm/dump_pagetables: Remove the unused prev_pud variable

The 'prev_pud' local variable became unused after commit

  04b67022fb6d ("x86/mm/dump_pagetables: Speed up page tables dump for 
CONFIG_KASAN=y").

Remove it.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Borislav Petkov 
Cc: Andrey Ryabinin 
Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: x86-ml 
Link: 
https://lkml.kernel.org/r/1550136829-49088-1-git-send-email-zhangshao...@hisilicon.com
---
 arch/x86/mm/dump_pagetables.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e3cdc85ce5b6..ee8f8ab46941 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, p4d_t addr,
int i;
pud_t *start, *pud_start;
pgprotval_t prot, eff;
-   pud_t *prev_pud = NULL;
 
pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
@@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, p4d_t addr,
} else
note_page(m, st, __pgprot(0), 0, 3);
 
-   prev_pud = start;
start++;
}
 }


[PATCH] x86/mm/dump_pagetables: Remove the unused prev_pud variable

2019-02-14 Thread Shaokun Zhang
The 'prev_pud' local variable became unused after commit 04b67022fb6d 
("x86/mm/dump_pagetables: Speed up page tables dump for CONFIG_KASAN=y"),
let's remove it.

Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Andrey Ryabinin 
Signed-off-by: Shaokun Zhang 
---
 arch/x86/mm/dump_pagetables.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e3cdc85ce5b6..ee8f8ab46941 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, p4d_t addr,
int i;
pud_t *start, *pud_start;
pgprotval_t prot, eff;
-   pud_t *prev_pud = NULL;
 
pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
@@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, p4d_t addr,
} else
note_page(m, st, __pgprot(0), 0, 3);
 
-   prev_pud = start;
start++;
}
 }
-- 
2.7.4



[tip:x86/cleanups] x86/mm/tlb: Remove unused cpu variable

2019-01-29 Thread tip-bot for Shaokun Zhang
Commit-ID:  691b9ab6c9676e5868a4787be9041dd990005311
Gitweb: https://git.kernel.org/tip/691b9ab6c9676e5868a4787be9041dd990005311
Author: Shaokun Zhang 
AuthorDate: Tue, 29 Jan 2019 15:36:57 +0800
Committer:  Borislav Petkov 
CommitDate: Tue, 29 Jan 2019 18:32:30 +0100

x86/mm/tlb: Remove unused cpu variable

The "cpu" local variable became unused after

  a2055abe9c67 ("x86/mm: Pass flush_tlb_info to flush_tlb_others() etc").

Remove it.

Signed-off-by: Shaokun Zhang 
Signed-off-by: Borislav Petkov 
Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: x86-ml 
Link: 
https://lkml.kernel.org/r/1548747417-33551-1-git-send-email-zhangshao...@hisilicon.com
---
 arch/x86/mm/tlb.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 999d6d8f0bef..bc4bc7b2f075 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 * that UV should be updated so that smp_call_function_many(),
 * etc, are optimal on UV.
 */
-   unsigned int cpu;
-
-   cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func_remote,


[PATCH] x86/mm: Remove unused cpu variable

2019-01-28 Thread Shaokun Zhang
The cpu variable is never used after commit a2055abe9c67 ("x86/mm: Pass
flush_tlb_info to flush_tlb_others() etc"), so remove it.

Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Shaokun Zhang 
---
 arch/x86/mm/tlb.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 999d6d8f0bef..bc4bc7b2f075 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 * that UV should be updated so that smp_call_function_many(),
 * etc, are optimal on UV.
 */
-   unsigned int cpu;
-
-   cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func_remote,
-- 
2.7.4



[PATCH] iommu/dma: Remove unused variable

2019-01-23 Thread Shaokun Zhang
end_pfn is never used after commit  ('iommu/iova: Make dma
32bit pfn implicit'), cleanup it.

Cc: Joerg Roedel 
Cc: Robin Murphy 
Cc: Zhen Lei 
Signed-off-by: Shaokun Zhang 
---
 drivers/iommu/dma-iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d19f3d6b43c1..77aabe637a60 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -289,7 +289,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
 {
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
-   unsigned long order, base_pfn, end_pfn;
+   unsigned long order, base_pfn;
int attr;
 
if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
@@ -298,7 +298,6 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
/* Use the smallest supported page size for IOVA granularity */
order = __ffs(domain->pgsize_bitmap);
base_pfn = max_t(unsigned long, 1, base >> order);
-   end_pfn = (base + size - 1) >> order;
 
/* Check the domain allows at least some access to the device... */
if (domain->geometry.force_aperture) {
-- 
2.7.4



[PATCH] driver core: remove unnecessary function extern declare

2018-07-15 Thread Shaokun Zhang
device_private_init() is called only in core.c, so the extern declaration
is unnecessary; make the function static.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Signed-off-by: Shaokun Zhang 
---
 drivers/base/base.h | 2 --
 drivers/base/core.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index a75c302..7a419a7 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -84,8 +84,6 @@ struct device_private {
 #define to_device_private_bus(obj) \
container_of(obj, struct device_private, knode_bus)
 
-extern int device_private_init(struct device *dev);
-
 /* initialisation functions */
 extern int devices_init(void);
 extern int buses_init(void);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df3e1a4..2ba30ce 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1736,7 +1736,7 @@ static void device_remove_sys_dev_entry(struct device 
*dev)
}
 }
 
-int device_private_init(struct device *dev)
+static int device_private_init(struct device *dev)
 {
dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
if (!dev->p)
-- 
2.7.4



[PATCH] driver core: remove unnecessary function extern declare

2018-07-15 Thread Shaokun Zhang
device_private_init() is called only in core.c, so the extern declaration
is unnecessary; make the function static.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Signed-off-by: Shaokun Zhang 
---
 drivers/base/base.h | 2 --
 drivers/base/core.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index a75c302..7a419a7 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -84,8 +84,6 @@ struct device_private {
 #define to_device_private_bus(obj) \
container_of(obj, struct device_private, knode_bus)
 
-extern int device_private_init(struct device *dev);
-
 /* initialisation functions */
 extern int devices_init(void);
 extern int buses_init(void);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df3e1a4..2ba30ce 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1736,7 +1736,7 @@ static void device_remove_sys_dev_entry(struct device 
*dev)
}
 }
 
-int device_private_init(struct device *dev)
+static int device_private_init(struct device *dev)
 {
dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
if (!dev->p)
-- 
2.7.4



[PATCH] iommu/vt-d, trivial: Remove unused variable

2018-03-22 Thread Shaokun Zhang
Unused after commit <42e8c186b595> ("iommu/vt-d: Simplify io/tlb flushing
in intel_iommu_unmap"), cleanup it.

Cc: David Woodhouse <dw...@infradead.org>
Cc: Joerg Roedel <j...@8bytes.org>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
---
 drivers/iommu/intel-iommu.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 582fd01..d49e0d3 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5072,7 +5072,6 @@ static size_t intel_iommu_unmap(struct iommu_domain 
*domain,
 {
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct page *freelist = NULL;
-   struct intel_iommu *iommu;
unsigned long start_pfn, last_pfn;
unsigned int npages;
int iommu_id, level = 0;
@@ -5091,12 +5090,9 @@ static size_t intel_iommu_unmap(struct iommu_domain 
*domain,
 
npages = last_pfn - start_pfn + 1;
 
-   for_each_domain_iommu(iommu_id, dmar_domain) {
-   iommu = g_iommus[iommu_id];
-
+   for_each_domain_iommu(iommu_id, dmar_domain)
iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
  start_pfn, npages, !freelist, 0);
-   }
 
dma_free_pagelist(freelist);
 
-- 
1.9.1



[PATCH] iommu/vt-d, trivial: Remove unused variable

2018-03-22 Thread Shaokun Zhang
Unused after commit <42e8c186b595> ("iommu/vt-d: Simplify io/tlb flushing
in intel_iommu_unmap"), cleanup it.

Cc: David Woodhouse 
Cc: Joerg Roedel 
Signed-off-by: Shaokun Zhang 
---
 drivers/iommu/intel-iommu.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 582fd01..d49e0d3 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5072,7 +5072,6 @@ static size_t intel_iommu_unmap(struct iommu_domain 
*domain,
 {
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct page *freelist = NULL;
-   struct intel_iommu *iommu;
unsigned long start_pfn, last_pfn;
unsigned int npages;
int iommu_id, level = 0;
@@ -5091,12 +5090,9 @@ static size_t intel_iommu_unmap(struct iommu_domain 
*domain,
 
npages = last_pfn - start_pfn + 1;
 
-   for_each_domain_iommu(iommu_id, dmar_domain) {
-   iommu = g_iommus[iommu_id];
-
+   for_each_domain_iommu(iommu_id, dmar_domain)
iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
  start_pfn, npages, !freelist, 0);
-   }
 
dma_free_pagelist(freelist);
 
-- 
1.9.1



[PATCH v6 2/6] perf: hisi: Add support for HiSilicon SoC uncore PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the HiSilicon SoC uncore PMU driver framework
and interfaces.

Reviewed-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
Signed-off-by: Anurup M <anuru...@huawei.com>
---
 drivers/perf/Kconfig |   7 +
 drivers/perf/Makefile|   1 +
 drivers/perf/hisilicon/Makefile  |   1 +
 drivers/perf/hisilicon/hisi_uncore_pmu.c | 444 +++
 drivers/perf/hisilicon/hisi_uncore_pmu.h | 102 +++
 5 files changed, 555 insertions(+)
 create mode 100644 drivers/perf/hisilicon/Makefile
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_pmu.c
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_pmu.h

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index e5197ff..b1a3894 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -17,6 +17,13 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config HISI_PMU
+   bool "HiSilicon SoC PMU"
+   depends on ARM64 && ACPI
+   help
+ Support for HiSilicon SoC uncore performance monitoring
+ unit (PMU), such as: L3C, HHA and DDRC.
+
 config QCOM_L2_PMU
bool "Qualcomm Technologies L2-cache PMU"
depends on ARCH_QCOM && ARM64 && ACPI
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 6420bd4..41d3342 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
new file mode 100644
index 000..2783bb3
--- /dev/null
+++ b/drivers/perf/hisilicon/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_pmu.c
new file mode 100644
index 000..2bff43f
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -0,0 +1,444 @@
+/*
+ * HiSilicon SoC Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Anurup M <anuru...@huawei.com>
+ * Shaokun Zhang <zhangshao...@hisilicon.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff)
+#define HISI_MAX_PERIOD(nr) (BIT_ULL(nr) - 1)
+
+/*
+ * PMU format attributes
+ */
+ssize_t hisi_format_sysfs_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct dev_ext_attribute *eattr;
+
+   eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+   return sprintf(buf, "%s\n", (char *)eattr->var);
+}
+
+/*
+ * PMU event attributes
+ */
+ssize_t hisi_event_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+   struct dev_ext_attribute *eattr;
+
+   eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+   return sprintf(page, "config=0x%lx\n", (unsigned long)eattr->var);
+}
+
+/*
+ * sysfs cpumask attributes. For uncore PMU, we only have a single CPU to show
+ */
+ssize_t hisi_cpumask_sysfs_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct hisi_pmu *hisi_pmu = to_hisi_pmu(dev_get_drvdata(dev));
+
+   return sprintf(buf, "%d\n", hisi_pmu->on_cpu);
+}
+
+static bool hisi_validate_event_group(struct perf_event *event)
+{
+   struct perf_event *sibling, *leader = event->group_leader;
+   struct hisi_pmu *hisi_pmu = to_hisi_pmu(event->pmu);
+   /* Include count for the event */
+   int counters = 1;
+
+   /*
+* We must NOT create groups containing mixed PMUs, although
+* software events are acceptable
+*/
+   if (leader->pmu != event->pmu && !is_software_event(leader))
+   return false;
+
+   /* Increment counter for the leader */
+   counters++;
+
+   list_for_each_entry(sibling, &event->group_leader->sibling_list,
+   group_entry) {
+   if (is_software_event(sibling))
+   continue;
+   if (sibling->pmu != event->pmu)
+   return false;
+   /* Increment counter for

[PATCH v6 2/6] perf: hisi: Add support for HiSilicon SoC uncore PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the HiSilicon SoC uncore PMU driver framework
and interfaces.

Reviewed-by: Jonathan Cameron 
Signed-off-by: Shaokun Zhang 
Signed-off-by: Anurup M 
---
 drivers/perf/Kconfig |   7 +
 drivers/perf/Makefile|   1 +
 drivers/perf/hisilicon/Makefile  |   1 +
 drivers/perf/hisilicon/hisi_uncore_pmu.c | 444 +++
 drivers/perf/hisilicon/hisi_uncore_pmu.h | 102 +++
 5 files changed, 555 insertions(+)
 create mode 100644 drivers/perf/hisilicon/Makefile
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_pmu.c
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_pmu.h

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index e5197ff..b1a3894 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -17,6 +17,13 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config HISI_PMU
+   bool "HiSilicon SoC PMU"
+   depends on ARM64 && ACPI
+   help
+ Support for HiSilicon SoC uncore performance monitoring
+ unit (PMU), such as: L3C, HHA and DDRC.
+
 config QCOM_L2_PMU
bool "Qualcomm Technologies L2-cache PMU"
depends on ARCH_QCOM && ARM64 && ACPI
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 6420bd4..41d3342 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
new file mode 100644
index 000..2783bb3
--- /dev/null
+++ b/drivers/perf/hisilicon/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_pmu.c
new file mode 100644
index 000..2bff43f
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -0,0 +1,444 @@
+/*
+ * HiSilicon SoC Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Anurup M 
+ * Shaokun Zhang 
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff)
+#define HISI_MAX_PERIOD(nr) (BIT_ULL(nr) - 1)
+
+/*
+ * PMU format attributes
+ */
+ssize_t hisi_format_sysfs_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct dev_ext_attribute *eattr;
+
+   eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+   return sprintf(buf, "%s\n", (char *)eattr->var);
+}
+
+/*
+ * PMU event attributes
+ */
+ssize_t hisi_event_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+   struct dev_ext_attribute *eattr;
+
+   eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+   return sprintf(page, "config=0x%lx\n", (unsigned long)eattr->var);
+}
+
+/*
+ * sysfs cpumask attributes. For uncore PMU, we only have a single CPU to show
+ */
+ssize_t hisi_cpumask_sysfs_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct hisi_pmu *hisi_pmu = to_hisi_pmu(dev_get_drvdata(dev));
+
+   return sprintf(buf, "%d\n", hisi_pmu->on_cpu);
+}
+
+static bool hisi_validate_event_group(struct perf_event *event)
+{
+   struct perf_event *sibling, *leader = event->group_leader;
+   struct hisi_pmu *hisi_pmu = to_hisi_pmu(event->pmu);
+   /* Include count for the event */
+   int counters = 1;
+
+   /*
+* We must NOT create groups containing mixed PMUs, although
+* software events are acceptable
+*/
+   if (leader->pmu != event->pmu && !is_software_event(leader))
+   return false;
+
+   /* Increment counter for the leader */
+   counters++;
+
+   list_for_each_entry(sibling, &event->group_leader->sibling_list,
+   group_entry) {
+   if (is_software_event(sibling))
+   continue;
+   if (sibling->pmu != event->pmu)
+   return false;
+   /* Increment counter for each sibling */
+   counters++;
+   }
+
+   /* The group can not count events more than the counters in the HW */
+   return count

[PATCH v6 3/6] perf: hisi: Add support for HiSilicon SoC L3C PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the L3C PMU driver in the HiSilicon SoC chip. Each
L3C has its own control, counter and interrupt registers and is a separate
PMU. Each L3C PMU has 8 programmable counters, and each counter
is free-running. An interrupt is supported to handle counter (48-bit)
overflow.

Reviewed-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
Signed-off-by: Anurup M <anuru...@huawei.com>
---
 drivers/perf/hisilicon/Makefile  |   2 +-
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 463 +++
 include/linux/cpuhotplug.h   |   1 +
 3 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 2783bb3..4a3d3e6 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
new file mode 100644
index 000..0bde5d9
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -0,0 +1,463 @@
+/*
+ * HiSilicon SoC L3C uncore Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Anurup M <anuru...@huawei.com>
+ * Shaokun Zhang <zhangshao...@hisilicon.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+/* L3C register definition */
+#define L3C_PERF_CTRL  0x0408
+#define L3C_INT_MASK   0x0800
+#define L3C_INT_STATUS 0x0808
+#define L3C_INT_CLEAR  0x080c
+#define L3C_EVENT_CTRL 0x1c00
+#define L3C_EVENT_TYPE00x1d00
+/*
+ * Each counter is 48-bits and [48:63] are reserved
+ * which are Read-As-Zero and Writes-Ignored.
+ */
+#define L3C_CNTR0_LOWER0x1e00
+
+/* L3C has 8-counters */
+#define L3C_NR_COUNTERS0x8
+
+#define L3C_PERF_CTRL_EN   0x2
+#define L3C_EVTYPE_NONE0xff
+
+/*
+ * Select the counter register offset using the counter index
+ */
+static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx)
+{
+   return (L3C_CNTR0_LOWER + (cntr_idx * 8));
+}
+
+static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu,
+struct hw_perf_event *hwc)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) {
+   dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return 0;
+   }
+
+   /* Read 64-bits and the upper 16 bits are RAZ */
+   return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx));
+}
+
+static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu,
+  struct hw_perf_event *hwc, u64 val)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) {
+   dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return;
+   }
+
+   /* Write 64-bits and the upper 16 bits are WI */
+   writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx));
+}
+
+static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx,
+ u32 type)
+{
+   u32 reg, reg_idx, shift, val;
+
+   /*
+* Select the appropriate event select register(L3C_EVENT_TYPE0/1).
+* There are 2 event select registers for the 8 hardware counters.
+* Event code is 8-bits and for the former 4 hardware counters,
+* L3C_EVENT_TYPE0 is chosen. For the latter 4 hardware counters,
+* L3C_EVENT_TYPE1 is chosen.
+*/
+   reg = L3C_EVENT_TYPE0 + (idx / 4) * 4;
+   reg_idx = idx % 4;
+   shift = 8 * reg_idx;
+
+   /* Write event code to L3C_EVENT_TYPEx Register */
+   val = readl(l3c_pmu->base + reg);
+   val &= ~(L3C_EVTYPE_NONE << shift);
+   val |= (type << shift);
+   writel(val, l3c_pmu->base + reg);
+}
+
+static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu)
+{
+   u32 val;
+
+   /*
+* Set perf_enable bit in L3C_PERF_CTRL register to start counting
+* for all enabled counters.
+*/
+   val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+   val |= L3C_PERF_CTRL_EN;
+   writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+}
+
+static void hisi_l3c_pmu

[PATCH v6 3/6] perf: hisi: Add support for HiSilicon SoC L3C PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the L3C PMU driver in the HiSilicon SoC chip. Each
L3C has its own control, counter and interrupt registers and is a separate
PMU. Each L3C PMU has 8 programmable counters, and each counter
is free-running. An interrupt is supported to handle counter (48-bit)
overflow.

Reviewed-by: Jonathan Cameron 
Signed-off-by: Shaokun Zhang 
Signed-off-by: Anurup M 
---
 drivers/perf/hisilicon/Makefile  |   2 +-
 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 463 +++
 include/linux/cpuhotplug.h   |   1 +
 3 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 2783bb3..4a3d3e6 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
new file mode 100644
index 000..0bde5d9
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -0,0 +1,463 @@
+/*
+ * HiSilicon SoC L3C uncore Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Anurup M 
+ * Shaokun Zhang 
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+/* L3C register definition */
+#define L3C_PERF_CTRL  0x0408
+#define L3C_INT_MASK   0x0800
+#define L3C_INT_STATUS 0x0808
+#define L3C_INT_CLEAR  0x080c
+#define L3C_EVENT_CTRL 0x1c00
+#define L3C_EVENT_TYPE00x1d00
+/*
+ * Each counter is 48-bits and [48:63] are reserved
+ * which are Read-As-Zero and Writes-Ignored.
+ */
+#define L3C_CNTR0_LOWER0x1e00
+
+/* L3C has 8-counters */
+#define L3C_NR_COUNTERS0x8
+
+#define L3C_PERF_CTRL_EN   0x2
+#define L3C_EVTYPE_NONE0xff
+
+/*
+ * Select the counter register offset using the counter index
+ */
+static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx)
+{
+   return (L3C_CNTR0_LOWER + (cntr_idx * 8));
+}
+
+static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu,
+struct hw_perf_event *hwc)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) {
+   dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return 0;
+   }
+
+   /* Read 64-bits and the upper 16 bits are RAZ */
+   return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx));
+}
+
+static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu,
+  struct hw_perf_event *hwc, u64 val)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) {
+   dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return;
+   }
+
+   /* Write 64-bits and the upper 16 bits are WI */
+   writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx));
+}
+
+static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx,
+ u32 type)
+{
+   u32 reg, reg_idx, shift, val;
+
+   /*
+* Select the appropriate event select register(L3C_EVENT_TYPE0/1).
+* There are 2 event select registers for the 8 hardware counters.
+* Event code is 8-bits and for the former 4 hardware counters,
+* L3C_EVENT_TYPE0 is chosen. For the latter 4 hardware counters,
+* L3C_EVENT_TYPE1 is chosen.
+*/
+   reg = L3C_EVENT_TYPE0 + (idx / 4) * 4;
+   reg_idx = idx % 4;
+   shift = 8 * reg_idx;
+
+   /* Write event code to L3C_EVENT_TYPEx Register */
+   val = readl(l3c_pmu->base + reg);
+   val &= ~(L3C_EVTYPE_NONE << shift);
+   val |= (type << shift);
+   writel(val, l3c_pmu->base + reg);
+}
+
+static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu)
+{
+   u32 val;
+
+   /*
+* Set perf_enable bit in L3C_PERF_CTRL register to start counting
+* for all enabled counters.
+*/
+   val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+   val |= L3C_PERF_CTRL_EN;
+   writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+}
+
+static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu)
+{
+   u32 val;
+
+   /*
+* Clear perf_enable bit in L3C_PERF_CTRL register to stop counting
+ 

[PATCH v6 6/6] arm64: MAINTAINERS: hisi: Add HiSilicon SoC PMU support

2017-10-19 Thread Shaokun Zhang
Add support for the HiSilicon SoC uncore PMU driver.

Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a74227a..96c583c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6242,6 +6242,13 @@ S:   Maintained
 F: drivers/net/ethernet/hisilicon/
 F: Documentation/devicetree/bindings/net/hisilicon*.txt
 
+HISILICON PMU DRIVER
+M: Shaokun Zhang <zhangshao...@hisilicon.com>
+W: http://www.hisilicon.com
+S: Supported
+F: drivers/perf/hisilicon
+F: Documentation/perf/hisi-pmu.txt
+
 HISILICON ROCE DRIVER
 M: Lijun Ou <ouli...@huawei.com>
 M: Wei Hu(Xavier) <xavier.hu...@huawei.com>
-- 
1.9.1



[PATCH v6 6/6] arm64: MAINTAINERS: hisi: Add HiSilicon SoC PMU support

2017-10-19 Thread Shaokun Zhang
Add support for the HiSilicon SoC uncore PMU driver.

Signed-off-by: Shaokun Zhang 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a74227a..96c583c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6242,6 +6242,13 @@ S:   Maintained
 F: drivers/net/ethernet/hisilicon/
 F: Documentation/devicetree/bindings/net/hisilicon*.txt
 
+HISILICON PMU DRIVER
+M: Shaokun Zhang 
+W: http://www.hisilicon.com
+S: Supported
+F: drivers/perf/hisilicon
+F: Documentation/perf/hisi-pmu.txt
+
 HISILICON ROCE DRIVER
 M: Lijun Ou 
 M: Wei Hu(Xavier) 
-- 
1.9.1



[PATCH v6 5/6] perf: hisi: Add support for HiSilicon SoC DDRC PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the DDRC PMU driver in the HiSilicon SoC chip. Each
DDRC has its own control, counter and interrupt registers and is a separate
PMU. Each DDRC PMU has 8 fixed-purpose counters which have been
mapped to 8 events by hardware; the DDRC PMU driver assumes that the counter
index is equal to the event code (0 - 7). An interrupt is supported to
handle counter (32-bit) overflow.

Reviewed-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
Signed-off-by: Anurup M <anuru...@huawei.com>
---
 drivers/perf/hisilicon/Makefile   |   2 +-
 drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 463 ++
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index a72afe8..2621d51 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o 
hisi_uncore_hha_pmu.o
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o 
hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
new file mode 100644
index 000..1b10ea0
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
@@ -0,0 +1,463 @@
+/*
+ * HiSilicon SoC DDRC uncore Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Shaokun Zhang <zhangshao...@hisilicon.com>
+ * Anurup M <anuru...@huawei.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+/* DDRC register definition */
+#define DDRC_PERF_CTRL 0x010
+#define DDRC_FLUX_WR   0x380
+#define DDRC_FLUX_RD   0x384
+#define DDRC_FLUX_WCMD  0x388
+#define DDRC_FLUX_RCMD  0x38c
+#define DDRC_PRE_CMD0x3c0
+#define DDRC_ACT_CMD0x3c4
+#define DDRC_BNK_CHG0x3c8
+#define DDRC_RNK_CHG0x3cc
+#define DDRC_EVENT_CTRL 0x6C0
+#define DDRC_INT_MASK  0x6c8
+#define DDRC_INT_STATUS0x6cc
+#define DDRC_INT_CLEAR 0x6d0
+
+/* DDRC has 8-counters */
+#define DDRC_NR_COUNTERS   0x8
+#define DDRC_PERF_CTRL_EN  0x2
+
+/*
+ * For DDRC PMU, there are eight-events and every event has been mapped
+ * to fixed-purpose counters which register offset is not consistent.
+ * Therefore there is no write event type and we assume that event
+ * code (0 to 7) is equal to counter index in PMU driver.
+ */
+#define GET_DDRC_EVENTID(hwc)  (hwc->config_base & 0x7)
+
+static const u32 ddrc_reg_off[] = {
+   DDRC_FLUX_WR, DDRC_FLUX_RD, DDRC_FLUX_WCMD, DDRC_FLUX_RCMD,
+   DDRC_PRE_CMD, DDRC_ACT_CMD, DDRC_BNK_CHG, DDRC_RNK_CHG
+};
+
+/*
+ * Select the counter register offset using the counter index.
+ * In DDRC there are no programmable counters; the count
+ * is read from the statistics counter register itself.
+ */
+static u32 hisi_ddrc_pmu_get_counter_offset(int cntr_idx)
+{
+   return ddrc_reg_off[cntr_idx];
+}
+
+static u64 hisi_ddrc_pmu_read_counter(struct hisi_pmu *ddrc_pmu,
+ struct hw_perf_event *hwc)
+{
+   /* Use event code as counter index */
+   u32 idx = GET_DDRC_EVENTID(hwc);
+
+   if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) {
+   dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return 0;
+   }
+
+   return readl(ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx));
+}
+
+static void hisi_ddrc_pmu_write_counter(struct hisi_pmu *ddrc_pmu,
+   struct hw_perf_event *hwc, u64 val)
+{
+   u32 idx = GET_DDRC_EVENTID(hwc);
+
+   if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) {
+   dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return;
+   }
+
+   writel((u32)val,
+  ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx));
+}
+
+/*
+ * For DDRC PMU, event has been mapped to fixed-purpose counter by hardware,
+ * so there is no need to write event type.
+ */
+static void hisi_ddrc_pmu_write_evtype(struct hisi_pmu *hha_pmu, int idx,
+  u32 type)
+{
+}
+
+static void hisi_ddrc_pmu_start_counters(struct hisi_pmu *ddrc_pmu)
+{
+   u32 val;
+
+   /* Set perf_enable in DDRC_PERF_CTRL

[PATCH v6 5/6] perf: hisi: Add support for HiSilicon SoC DDRC PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds support for the DDRC PMU driver in the HiSilicon SoC chip. Each
DDRC has its own control, counter and interrupt registers and is a separate
PMU. Each DDRC PMU has 8 fixed-purpose counters which have been
mapped to 8 events by hardware; the DDRC PMU driver assumes that the counter
index is equal to the event code (0 - 7). An interrupt is supported to
handle counter (32-bit) overflow.

Reviewed-by: Jonathan Cameron 
Signed-off-by: Shaokun Zhang 
Signed-off-by: Anurup M 
---
 drivers/perf/hisilicon/Makefile   |   2 +-
 drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 463 ++
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index a72afe8..2621d51 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o 
hisi_uncore_hha_pmu.o
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o 
hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c 
b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
new file mode 100644
index 000..1b10ea0
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
@@ -0,0 +1,463 @@
+/*
+ * HiSilicon SoC DDRC uncore Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Shaokun Zhang 
+ * Anurup M 
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+/* DDRC register definition */
+#define DDRC_PERF_CTRL 0x010
+#define DDRC_FLUX_WR   0x380
+#define DDRC_FLUX_RD   0x384
+#define DDRC_FLUX_WCMD  0x388
+#define DDRC_FLUX_RCMD  0x38c
+#define DDRC_PRE_CMD0x3c0
+#define DDRC_ACT_CMD0x3c4
+#define DDRC_BNK_CHG0x3c8
+#define DDRC_RNK_CHG0x3cc
+#define DDRC_EVENT_CTRL 0x6C0
+#define DDRC_INT_MASK  0x6c8
+#define DDRC_INT_STATUS0x6cc
+#define DDRC_INT_CLEAR 0x6d0
+
+/* DDRC has 8-counters */
+#define DDRC_NR_COUNTERS   0x8
+#define DDRC_PERF_CTRL_EN  0x2
+
+/*
+ * For DDRC PMU, there are eight-events and every event has been mapped
+ * to fixed-purpose counters which register offset is not consistent.
+ * Therefore there is no write event type and we assume that event
+ * code (0 to 7) is equal to counter index in PMU driver.
+ */
+#define GET_DDRC_EVENTID(hwc)  (hwc->config_base & 0x7)
+
+static const u32 ddrc_reg_off[] = {
+   DDRC_FLUX_WR, DDRC_FLUX_RD, DDRC_FLUX_WCMD, DDRC_FLUX_RCMD,
+   DDRC_PRE_CMD, DDRC_ACT_CMD, DDRC_BNK_CHG, DDRC_RNK_CHG
+};
+
+/*
+ * Select the counter register offset using the counter index.
+ * In DDRC there are no programmable counters; the count
+ * is read from the statistics counter register itself.
+ */
+static u32 hisi_ddrc_pmu_get_counter_offset(int cntr_idx)
+{
+   return ddrc_reg_off[cntr_idx];
+}
+
+static u64 hisi_ddrc_pmu_read_counter(struct hisi_pmu *ddrc_pmu,
+ struct hw_perf_event *hwc)
+{
+   /* Use event code as counter index */
+   u32 idx = GET_DDRC_EVENTID(hwc);
+
+   if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) {
+   dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return 0;
+   }
+
+   return readl(ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx));
+}
+
+static void hisi_ddrc_pmu_write_counter(struct hisi_pmu *ddrc_pmu,
+   struct hw_perf_event *hwc, u64 val)
+{
+   u32 idx = GET_DDRC_EVENTID(hwc);
+
+   if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) {
+   dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return;
+   }
+
+   writel((u32)val,
+  ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx));
+}
+
+/*
+ * For DDRC PMU, event has been mapped to fixed-purpose counter by hardware,
+ * so there is no need to write event type.
+ */
+static void hisi_ddrc_pmu_write_evtype(struct hisi_pmu *hha_pmu, int idx,
+  u32 type)
+{
+}
+
+static void hisi_ddrc_pmu_start_counters(struct hisi_pmu *ddrc_pmu)
+{
+   u32 val;
+
+   /* Set perf_enable in DDRC_PERF_CTRL to start event counting */
+   val = readl(ddrc_pmu->base + DDRC_PERF_CTRL);
+   val |= DDRC_PERF_CTRL_EN;
+   writel(val, ddrc_pmu->base + DDRC_PE

[PATCH v6 1/6] Documentation: perf: hisi: Documentation for HiSilicon SoC PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds documentation for the uncore PMUs on HiSilicon SoC.

Reviewed-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
Signed-off-by: Anurup M <anuru...@huawei.com>
---
 Documentation/perf/hisi-pmu.txt | 53 +
 1 file changed, 53 insertions(+)
 create mode 100644 Documentation/perf/hisi-pmu.txt

diff --git a/Documentation/perf/hisi-pmu.txt b/Documentation/perf/hisi-pmu.txt
new file mode 100644
index 000..267a028
--- /dev/null
+++ b/Documentation/perf/hisi-pmu.txt
@@ -0,0 +1,53 @@
+HiSilicon SoC uncore Performance Monitoring Unit (PMU)
+======================================================
+The HiSilicon SoC chip includes various independent system device PMUs
+such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
+independent and have hardware logic to gather statistics and performance
+information.
+
+The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
+(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
+called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
+two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
+
+HiSilicon SoC uncore PMU driver
+-------------------------------
+Each device PMU has separate registers for event counting, control and
+interrupt, and the PMU driver shall register perf PMU drivers like L3C,
+HHA and DDRC etc. The available events and configuration options shall
+be described in the sysfs, see :
+/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
+The "perf list" command shall list the available events from sysfs.
+
+Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
+name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>,
+where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
+the module.
+e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
+SCCL ID #3.
+e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
+SCCL ID #1.
+
+The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
+ID used to count the uncore PMU event.
+
+Example usage of perf:
+$# perf list
+hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
+--
+
+$# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
+$# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
+
+The current driver does not support sampling, so "perf record" is unsupported.
+Attaching to a task is also unsupported, as the events are all uncore.
+
+Note: Please contact the maintainer for a complete list of events supported for
+the PMU devices in the SoC and its information if needed.
-- 
1.9.1



[PATCH v6 1/6] Documentation: perf: hisi: Documentation for HiSilicon SoC PMU driver

2017-10-19 Thread Shaokun Zhang
This patch adds documentation for the uncore PMUs on HiSilicon SoC.

Reviewed-by: Jonathan Cameron 
Signed-off-by: Shaokun Zhang 
Signed-off-by: Anurup M 
---
 Documentation/perf/hisi-pmu.txt | 53 +
 1 file changed, 53 insertions(+)
 create mode 100644 Documentation/perf/hisi-pmu.txt

diff --git a/Documentation/perf/hisi-pmu.txt b/Documentation/perf/hisi-pmu.txt
new file mode 100644
index 000..267a028
--- /dev/null
+++ b/Documentation/perf/hisi-pmu.txt
@@ -0,0 +1,53 @@
+HiSilicon SoC uncore Performance Monitoring Unit (PMU)
+======================================================
+The HiSilicon SoC chip includes various independent system device PMUs
+such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
+independent and have hardware logic to gather statistics and performance
+information.
+
+The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
+(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
+called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
+two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
+
+HiSilicon SoC uncore PMU driver
+-------------------------------
+Each device PMU has separate registers for event counting, control and
+interrupt, and the PMU driver shall register perf PMU drivers like L3C,
+HHA and DDRC etc. The available events and configuration options shall
+be described in the sysfs, see :
+/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
+The "perf list" command shall list the available events from sysfs.
+
+Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
+name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>.
+where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
+module.
+e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
+SCCL ID #3.
+e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
+SCCL ID #1.
+
+The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
+ID used to count the uncore PMU event.
+
+Example usage of perf:
+$# perf list
+hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
+--
+hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
+--
+
+$# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
+$# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
+
+The current driver does not support sampling, so "perf record" is unsupported.
+Attaching to a task is also unsupported, as the events are all uncore.
+
+Note: Please contact the maintainer for a complete list of events supported for
+the PMU devices in the SoC and its information if needed.
-- 
1.9.1



[PATCH v6 4/6] perf: hisi: Add support for HiSilicon SoC HHA PMU driver

2017-10-19 Thread Shaokun Zhang
L3 cache coherence is maintained by Hydra Home Agent (HHA) in HiSilicon
SoC. This patch adds support for HHA PMU driver. Each HHA has its own
control, counter and interrupt registers and is a separate PMU. Each
HHA PMU has 16 programmable counters and each counter is
free-running. Interrupt is supported to handle counter (48-bit)
overflow.

Reviewed-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshao...@hisilicon.com>
Signed-off-by: Anurup M <anuru...@huawei.com>
---
 drivers/perf/hisilicon/Makefile  |   2 +-
 drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 473 +++
 include/linux/cpuhotplug.h   |   1 +
 3 files changed, 475 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_hha_pmu.c

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 4a3d3e6..a72afe8 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o
+obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o hisi_uncore_hha_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
new file mode 100644
index 000..443906e
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
@@ -0,0 +1,473 @@
+/*
+ * HiSilicon SoC HHA uncore Hardware event counters support
+ *
+ * Copyright (C) 2017 Hisilicon Limited
+ * Author: Shaokun Zhang <zhangshao...@hisilicon.com>
+ * Anurup M <anuru...@huawei.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hisi_uncore_pmu.h"
+
+/* HHA register definition */
+#define HHA_INT_MASK   0x0804
+#define HHA_INT_STATUS 0x0808
+#define HHA_INT_CLEAR  0x080C
+#define HHA_PERF_CTRL  0x1E00
+#define HHA_EVENT_CTRL 0x1E04
+#define HHA_EVENT_TYPE0    0x1E80
+/*
+ * Each counter is 48-bits and [48:63] are reserved
+ * which are Read-As-Zero and Writes-Ignored.
+ */
+#define HHA_CNT0_LOWER 0x1F00
+
+/* HHA has 16-counters */
+#define HHA_NR_COUNTERS    0x10
+
+#define HHA_PERF_CTRL_EN   0x1
+#define HHA_EVTYPE_NONE    0xff
+
+/*
+ * Return the register offset of hardware counter @cntr_idx, relative to
+ * the PMU register base. Counter registers start at HHA_CNT0_LOWER and
+ * are laid out 8 bytes apart; each counter is 48-bits wide.
+ */
+static u32 hisi_hha_pmu_get_counter_offset(int cntr_idx)
+{
+   return (HHA_CNT0_LOWER + (cntr_idx * 8));
+}
+
+static u64 hisi_hha_pmu_read_counter(struct hisi_pmu *hha_pmu,
+struct hw_perf_event *hwc)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(hha_pmu, idx)) {
+   dev_err(hha_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return 0;
+   }
+
+   /*
+    * Return the current value of the counter selected by hwc->idx; an
+    * index rejected by the check above is logged and reads as 0.
+    * The register is read as 64 bits and, like L3C, its top 16 bits
+    * are RAZ.
+    */
+   return readq(hha_pmu->base + hisi_hha_pmu_get_counter_offset(idx));
+}
+
+static void hisi_hha_pmu_write_counter(struct hisi_pmu *hha_pmu,
+  struct hw_perf_event *hwc, u64 val)
+{
+   u32 idx = hwc->idx;
+
+   if (!hisi_uncore_pmu_counter_valid(hha_pmu, idx)) {
+   dev_err(hha_pmu->dev, "Unsupported event index:%d!\n", idx);
+   return;
+   }
+
+   /*
+    * Program @val into the counter selected by hwc->idx; an index
+    * rejected by the check above is logged and nothing is written.
+    * The register is written as 64 bits and, like L3C, its top
+    * 16 bits are WI.
+    */
+   writeq(val, hha_pmu->base + hisi_hha_pmu_get_counter_offset(idx));
+}
+
+static void hisi_hha_pmu_write_evtype(struct hisi_pmu *hha_pmu, int idx,
+ u32 type)
+{
+   u32 reg, reg_idx, shift, val;
+
+   /*
+    * Select the appropriate event select register(HHA_EVENT_TYPEx).
+    * There are 4 event select registers for the 16 hardware counters.
+    * Event code is 8-bits and for the first 4 hardware counters,
+    * HHA_EVENT_TYPE0 is chosen. For the next 4 hardware counters,
+    * HHA_EVENT_TYPE1 is chosen and so on.
+    *
+    * The registers are 4 bytes apart, so counter @idx uses register
+    * number (idx / 4) and the 8-bit field number (idx % 4) inside it.
+    */
+   reg = HHA_EVENT_TYPE0 + 4 * (idx / 4);
+   reg_idx = idx % 4;
+   shift = 8 * reg_idx;
+
+   /*
+    * Write event code to HHA_EVENT_TYPEx register: read-modify-write so
+    * that only this counter's 8-bit field (cleared with the
+    * HHA_EVTYPE_NONE mask) is replaced by @type and the other fields of
+    * the register keep their value.
+    * NOTE(review): @type is OR-ed in without masking; presumably callers
+    * only pass 8-bit event codes - confirm.
+    */
+   val = readl(hha_pmu->base + reg);
+   val &= ~(HHA_EVTYPE_NONE << shift);
+   val |= (type << shift);
+   writel(val, hha_pmu->base + reg);
+}
+
+static void hisi_hha_pmu_start_counters(struct hisi_pmu *hha_pmu)
+{
+   u32 val;
+
+   /*
+* Set perf_enable bit in HHA_PERF_CTRL to start event
+* counting for all enabled counters.
+*/
+   val = readl(hha_pmu->ba

  1   2   >