[PATCH bpf-next v1] selftests/bpf: Enhance bpf ability to detect ksym read errors using libcap

2024-09-14 Thread Lin Yikai
Ksym address visibility is restricted
by ``kptr_restrict`` (/proc/sys/kernel/kptr_restrict).

On some systems (like Android),
ksym addresses are not readable because ``kptr_restrict=2``,
and it took me a long time to find the root cause.

-When ``kptr_restrict==0``, addresses are visible:
# echo 0 > /proc/sys/kernel/kptr_restrict
# cat /proc/kallsyms | grep bpf_link_fops
ffd6bfd3fb60 d bpf_link_fops
-When ``kptr_restrict==2``, addresses are replaced by ZERO:
# echo 2 > /proc/sys/kernel/kptr_restrict
# cat /proc/kallsyms | grep bpf_link_fops
0000000000000000 d bpf_link_fops
-When ``kptr_restrict==1``, addresses are visible only to users with CAP_SYSLOG.
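(Illustrative, reusing the address shown above; run with CAP_SYSLOG, e.g. as root:)
# echo 1 > /proc/sys/kernel/kptr_restrict
# cat /proc/kallsyms | grep bpf_link_fops
ffd6bfd3fb60 d bpf_link_fops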

So we should perform a check to remind users of these conditions
before reading /proc/kallsyms.

[before]:
# echo 2 > /proc/sys/kernel/kptr_restrict
# ./test_progs -t ksyms
#133 ksyms:FAIL

[after]:
# echo 2 > /proc/sys/kernel/kptr_restrict
# ./test_progs -t ksyms
ksyms restricted, please check /proc/sys/kernel/kptr_restrict
#133 ksyms:FAIL

Signed-off-by: Lin Yikai 
---
 tools/testing/selftests/bpf/Makefile        |  7 +++++-
 tools/testing/selftests/bpf/trace_helpers.c | 63 +++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 04716a5e43f1..369c5ad8fc4a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -183,7 +183,7 @@ NON_CHECK_FEAT_TARGETS := clean docs-clean
 CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none"))
 ifneq ($(CHECK_FEAT),)
 FEATURE_USER := .selftests
-FEATURE_TESTS := llvm
+FEATURE_TESTS := llvm libcap
 FEATURE_DISPLAY := $(FEATURE_TESTS)
 
 # Makefile.feature expects OUTPUT to end with a slash
@@ -208,6 +208,11 @@ ifeq ($(feature-llvm),1)
   LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags)
 endif
 
+ifeq ($(feature-libcap), 1)
+  CFLAGS += -DHAVE_LIBCAP_SUPPORT
+  LDLIBS += -lcap
+endif
+
 SCRATCH_DIR := $(OUTPUT)/tools
 BUILD_DIR := $(SCRATCH_DIR)/build
 INCLUDE_DIR := $(SCRATCH_DIR)/include
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 2d742fdac6b9..8d2f951464ff 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -17,6 +17,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include "bpf/libbpf_internal.h"
 
 #define TRACEFS_PIPE   "/sys/kernel/tracing/trace_pipe"
@@ -31,6 +35,55 @@ struct ksyms {
 static struct ksyms *ksyms;
 static pthread_mutex_t ksyms_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+#ifdef HAVE_LIBCAP_SUPPORT
+#include <sys/capability.h>
+static bool bpf_cap__capable(cap_value_t cap)
+{
+   cap_flag_value_t val;
+   cap_t caps = cap_get_proc();
+
+   if (!caps)
+   return false;
+
+   if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val) != 0)
+   val = CAP_CLEAR;
+
+   if (cap_free(caps) != 0)
+   return false;
+
+   return val == CAP_SET;
+}
+#else
+static inline bool bpf_cap__capable(int cap __maybe_unused)
+{
+   return geteuid() == 0;
+}
+#endif /* HAVE_LIBCAP_SUPPORT */
+
+/* For older systems */
+#ifndef CAP_SYSLOG
+#define CAP_SYSLOG 34
+#endif
+
+static bool ksyms__kptr_restrict(void)
+{
+   bool value = false;
+   FILE *fp = fopen("/proc/sys/kernel/kptr_restrict", "r");
+
+   if (fp != NULL) {
+   char line[8];
+
+   if (fgets(line, sizeof(line), fp) != NULL)
+   value = bpf_cap__capable(CAP_SYSLOG) ?
+   (atoi(line) >= 2) :
+   (atoi(line) != 0);
+
+   fclose(fp);
+   }
+
+   return value;
+}
+
 static int ksyms__add_symbol(struct ksyms *ksyms, const char *name,
 unsigned long addr)
 {
@@ -72,6 +125,11 @@ static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb)
int ret;
struct ksyms *ksyms;
 
+   if (ksyms__kptr_restrict()) {
+   printf("ksyms restricted, please check 
/proc/sys/kernel/kptr_restrict\n");
+   return NULL;
+   }
+
f = fopen("/proc/kallsyms", "r");
if (!f)
return NULL;
@@ -218,6 +276,11 @@ int kallsyms_find(const char *sym, unsigned long long *addr)
int err = 0;
FILE *f;
 
+   if (ksyms__kptr_restrict()) {
+   printf("ksyms restricted, please check 
/proc/sys/kernel/kptr_restrict\n");
+   return -EINVAL;
+   }
+
f = fopen("/proc/kallsyms", "r");
if (!f)
return -EINVAL;
-- 
2.34.1




Re: [PATCH net-next] page_pool: add a test module for page_pool

2024-09-11 Thread Yunsheng Lin
On 2024/9/10 19:27, Jesper Dangaard Brouer wrote:
...

>>
>> The main issue I remembered was that it only supports x86 :(
>>
> 
> Yes, because I've added ASM code for reading the TSC counter in a very
> precise manner. Given we run many iterations, I don't think we
> need this precise reading.  I guess it can simply be replaced with
> get_cycles() or get_cycles64().  Then it should work on all archs.

Agreed.
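
For reference, a minimal sketch of such a replacement (my illustration,
not from any posted patch; bench_one_iteration() is a placeholder for
the operation under test):

#include <linux/timex.h>	/* get_cycles() */
#include <linux/printk.h>
#include <linux/math64.h>	/* div64_u64() */

static void time_bench_loop(unsigned long loops)
{
	u64 start, stop;
	unsigned long i;

	start = get_cycles();
	for (i = 0; i < loops; i++)
		bench_one_iteration();	/* placeholder for the op under test */
	stop = get_cycles();

	pr_info("avg cycles/iter: %llu\n", div64_u64(stop - start, loops));
}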

> 
> The code already supports wall-clock time via ktime_get() (specifically
> ktime_get_real_ts64()).
> 
> 
>>>
>>> My preference here (for the performance part) is to upstream the
>>> out-of-tree tests that Jesper (and probably others) are using, rather
>>> than adding a new performance test that is not as battle-hardened.
>>
>> I looked through the out-of-tree tests again; it seems we can take the
>> best of them.
>> For Jesper's ko:
>> It seems we can do prefill like pp_fill_ptr_ring() does
>> in bench_page_pool_simple.c to avoid the noise from the page allocator.
>>
>>
>> For the ko in this patch:
>> It uses NAPI instead of a tasklet to mimic the NAPI context, supports
>> PP_FLAG_DMA_MAP flag testing, and returns '-EAGAIN' from module_init()
>> so that perf stat can be used to collect and calculate performance data.
>>
> My bench doesn't return a negative number on module load, because I used perf
> record, and to see symbols decoded with perf report, I needed the module
> to be loaded.
> 
> I started on reading the PMU counters [1] around the bench loop; it works
> if you enable the PMU counters yourself/manually, but I never finished that work.
> 
>  [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/include/linux/time_bench.h#L195-L209
> 
> 
>> Are there other test cases or better practices we can learn from
>> Jesper's out-of-tree ko?
>>
> 
> I created a time_bench.c [2] module that other modules [3] can use to
> make it easier to reuse the benchmarking code in other modules.
> 
>  [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench.c
> 
>  [3] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c

Will take a look at it, thanks.

> 
> --Jesper



Re: [PATCH net-next] page_pool: add a test module for page_pool

2024-09-10 Thread Yunsheng Lin
On 2024/9/10 1:28, Mina Almasry wrote:
> On Mon, Sep 9, 2024 at 2:25 AM Yunsheng Lin  wrote:
>>
>> The testing is done by ensuring that a page allocated from
>> the page_pool instance is pushed into a ptr_ring instance by
>> a kthread/napi bound to a specified cpu, and a kthread/napi
>> bound to another specified cpu pops the page from the ptr_ring
>> and frees it back to the page_pool.
>>
>> Signed-off-by: Yunsheng Lin 
> 
> It seems this test has a correctness part and a performance part.
> For the performance test, Jesper has out of tree tests for the
> page_pool:
> https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c
> 
> I have these rebased on top of net-next and use them to verify devmem
> & memory-provider performance:
> https://github.com/mina/linux/commit/07fd1c04591395d15d83c07298b4d37f6b56157f

Yes, I used that testing ko too when adding frag API support for
page_pool.

The main issue I remembered was that it only supports x86 :(

> 
> My preference here (for the performance part) is to upstream the
> out-of-tree tests that Jesper (and probably others) are using, rather
> than adding a new performance test that is not as battle-hardened.

I looked through the out-of-tree tests again; it seems we can take the
best of them.
For Jesper's ko:
It seems we can do prefill like pp_fill_ptr_ring() does
in bench_page_pool_simple.c to avoid the noise from the page allocator.


For the ko in this patch:
It uses NAPI instead of a tasklet to mimic the NAPI context, supports
PP_FLAG_DMA_MAP flag testing, and returns '-EAGAIN' from module_init()
so that perf stat can be used to collect and calculate performance data.
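
For example (illustrative): since module_init() returns '-EAGAIN', the
module never stays loaded, so each insmod run is a fresh, perf-measurable
test:

# perf stat -e cycles,instructions insmod ./page_pool_test.ko test_dma=1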

Are there other test cases or better practices we can learn from
Jesper's out-of-tree ko?

> 
> --
> Thanks,
> Mina
> 



[PATCH net-next] page_pool: add a test module for page_pool

2024-09-09 Thread Yunsheng Lin
The testing is done by ensuring that a page allocated from
the page_pool instance is pushed into a ptr_ring instance by
a kthread/napi bound to a specified cpu, and a kthread/napi
bound to another specified cpu pops the page from the ptr_ring
and frees it back to the page_pool.
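
(Illustrative usage once the module is built; the parameters are the
module's own, see the diff below:)

# insmod ./page_pool_test.ko test_push_cpu=0 test_pop_cpu=1 test_napi=1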

Signed-off-by: Yunsheng Lin 
---
 tools/testing/selftests/net/Makefile  |   3 +
 .../testing/selftests/net/page_pool/Makefile  |  18 +
 .../selftests/net/page_pool/page_pool_test.c  | 433 ++
 tools/testing/selftests/net/test_page_pool.sh | 175 +++
 4 files changed, 629 insertions(+)
 create mode 100644 tools/testing/selftests/net/page_pool/Makefile
 create mode 100644 tools/testing/selftests/net/page_pool/page_pool_test.c
 create mode 100755 tools/testing/selftests/net/test_page_pool.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 27362e40eb37..4d4ddd853ef8 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,6 +6,8 @@ CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES)
 # Additional include paths needed by kselftest.h
 CFLAGS += -I../
 
+TEST_GEN_MODS_DIR := page_pool
+
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
  rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
@@ -96,6 +98,7 @@ TEST_PROGS += fdb_flush.sh
 TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
+TEST_PROGS += test_page_pool.sh
 
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
diff --git a/tools/testing/selftests/net/page_pool/Makefile b/tools/testing/selftests/net/page_pool/Makefile
new file mode 100644
index 000000000000..4380a70d6391
--- /dev/null
+++ b/tools/testing/selftests/net/page_pool/Makefile
@@ -0,0 +1,18 @@
+PAGE_POOL_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
+KDIR ?= $(abspath $(PAGE_POOL_TEST_DIR)/../../../../..)
+
+ifeq ($(V),1)
+Q =
+else
+Q = @
+endif
+
+MODULES = page_pool_test.ko
+
+obj-m += page_pool_test.o
+
+all:
+   +$(Q)make -C $(KDIR) M=$(PAGE_POOL_TEST_DIR) modules
+
+clean:
+   +$(Q)make -C $(KDIR) M=$(PAGE_POOL_TEST_DIR) clean
diff --git a/tools/testing/selftests/net/page_pool/page_pool_test.c b/tools/testing/selftests/net/page_pool/page_pool_test.c
new file mode 100644
index 000000000000..475b64f21b78
--- /dev/null
+++ b/tools/testing/selftests/net/page_pool/page_pool_test.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_pool
+ *
+ * Copyright (C) 2024 Yunsheng Lin 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct ptr_ring ptr_ring;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_pool *test_pool;
+static struct device *dev;
+static u64 dma_mask = DMA_BIT_MASK(64);
+
+static int nr_test = 200;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_frag;
+module_param(test_frag, bool, 0);
+MODULE_PARM_DESC(test_frag, "use frag API for testing");
+
+static bool test_dma;
+module_param(test_dma, bool, 0);
+MODULE_PARM_DESC(test_dma, "enable dma mapping for testing");
+
+static bool test_napi;
+module_param(test_napi, bool, 0);
+MODULE_PARM_DESC(test_napi, "use NAPI softirq for testing");
+
+static bool test_direct;
+module_param(test_direct, bool, 0);
+MODULE_PARM_DESC(test_direct, "enable direct recycle for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing page");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping page");
+
+static void page_pool_test_dev_release(struct device *dev)
+{
+   kfree(dev);
+}
+
+static struct page_pool *page_pool_test_create(void)
+{
+   struct page_pool_params page_pool_params = {
+   .pool_size = nr_objs,
+   .flags = 0,
+   .nid = cpu_to_mem(test_push_cpu),
+   };
+   int ret;
+
+   if (test_dma) {
+   dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+   if (!dev)
+   return ERR_PTR(-ENOMEM);
+
+   dev->release = page_pool_test_dev_release;
+   dev->dma_mask = &dma_mask;
+   device_initialize(dev);
+
+   ret = dev_set_name(dev, "page_pool_dev");
+   if (ret) {
+   pr_err("page_pool_test dev_set_name() failed: %d\n",
+   

[PATCH net-next v18 11/14] mm: page_frag: add testing for the newly added prepare API

2024-09-06 Thread Yunsheng Lin
Add testing for the newly added prepare API, for both the aligned
and non-aligned APIs; the probe API is also tested along with the
prepare API.
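
Roughly, the prepare/probe/commit flow being exercised looks like this
(a simplified sketch based on the test module changes below; nc, len
and used_len are illustrative, and error handling is omitted):

	struct page_frag prepare_frag, probe_frag;
	void *va, *probe_va;

	va = page_frag_alloc_refill_prepare(&nc, len, &prepare_frag, GFP_KERNEL);
	probe_va = page_frag_alloc_refill_probe(&nc, len, &probe_frag);

	/* probe must observe the same fragment that prepare returned */
	WARN_ON_ONCE(va != probe_va);
	if (va)
		page_frag_commit(&nc, &prepare_frag, used_len);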

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 .../selftests/mm/page_frag/page_frag_test.c   | 66 +--
 tools/testing/selftests/mm/run_vmtests.sh |  4 ++
 tools/testing/selftests/mm/test_page_frag.sh  | 31 +
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
index a4bd543d6950..7cfa896f69cb 100644
--- a/tools/testing/selftests/mm/page_frag/page_frag_test.c
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -27,6 +27,10 @@ static bool test_align;
 module_param(test_align, bool, 0);
 MODULE_PARM_DESC(test_align, "use align API for testing");
 
+static bool test_prepare;
+module_param(test_prepare, bool, 0);
+MODULE_PARM_DESC(test_prepare, "use prepare API for testing");
+
 static int test_alloc_len = 2048;
 module_param(test_alloc_len, int, 0);
 MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
@@ -67,6 +71,18 @@ static int page_frag_pop_thread(void *arg)
return 0;
 }
 
+static void frag_frag_test_commit(struct page_frag_cache *nc,
+ struct page_frag *prepare_pfrag,
+ struct page_frag *probe_pfrag,
+ unsigned int used_sz)
+{
+   WARN_ON_ONCE(prepare_pfrag->page != probe_pfrag->page ||
+prepare_pfrag->offset != probe_pfrag->offset ||
+prepare_pfrag->size != probe_pfrag->size);
+
+   page_frag_commit(nc, prepare_pfrag, used_sz);
+}
+
 static int page_frag_push_thread(void *arg)
 {
struct ptr_ring *ring = arg;
@@ -80,13 +96,52 @@ static int page_frag_push_thread(void *arg)
int ret;
 
if (test_align) {
-   va = page_frag_alloc_align(&test_nc, test_alloc_len,
-  GFP_KERNEL, SMP_CACHE_BYTES);
+   if (test_prepare) {
+   struct page_frag prepare_frag, probe_frag;
+   void *probe_va;
+
+				va = page_frag_alloc_refill_prepare_align(&test_nc,
+									  test_alloc_len,
+									  &prepare_frag,
+									  GFP_KERNEL,
+									  SMP_CACHE_BYTES);
+
+				probe_va = __page_frag_alloc_refill_probe_align(&test_nc,
+										test_alloc_len,
+										&probe_frag,
+										-SMP_CACHE_BYTES);
+   WARN_ON_ONCE(va != probe_va);
+
+   if (likely(va))
+					frag_frag_test_commit(&test_nc, &prepare_frag,
+							      &probe_frag, test_alloc_len);
+   } else {
+   va = page_frag_alloc_align(&test_nc,
+  test_alloc_len,
+  GFP_KERNEL,
+  SMP_CACHE_BYTES);
+   }
 
WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
  "unaligned va returned\n");
} else {
-			va = page_frag_alloc(&test_nc, test_alloc_len, GFP_KERNEL);
+   if (test_prepare) {
+   struct page_frag prepare_frag, probe_frag;
+   void *probe_va;
+
+				va = page_frag_alloc_refill_prepare(&test_nc, test_alloc_len,
+								    &prepare_frag, GFP_KERNEL);
+
+				probe_va = page_frag_alloc_refill_probe(&test_nc, test_alloc_len,
+									&probe_frag);
+
+   WARN_ON_ONCE(va != probe_va);
+   if (likely(va))
+					frag_frag_test_commit(&test_nc, &prepare_frag,
+							      &probe_frag, test_alloc_len);

[PATCH net-next v18 04/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-09-06 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers access
'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
Acked-by: Chuck Lever 
---
 drivers/vhost/net.c   |  2 +-
 include/linux/page_frag_cache.h   | 10 ++
 net/core/skbuff.c |  6 +++---
 net/rxrpc/conn_object.c   |  4 +---
 net/rxrpc/local_object.c  |  4 +---
 net/sunrpc/svcsock.c  |  6 ++
 tools/testing/selftests/mm/page_frag/page_frag_test.c |  2 +-
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..9ad37c012189 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 67ac8626ed9b..0a52f7a179c8 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,16 @@
 #include 
 #include 
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a52638363ea5..a5f8e4e0c649 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -752,14 +752,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -849,7 +849,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 6b3f01beb294..dcfd84cf0694 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void 

[PATCH net-next v18 02/14] mm: move the page fragment allocator from page_alloc into its own file

2024-09-06 Thread Yunsheng Lin
Inspired by [1], move the page fragment allocator from page_alloc
into its own c file and header file, as we are about to make more
changes to it so that it can replace another page_frag implementation
in sock.c.

As this patchset is going to replace 'struct page_frag' with
'struct page_frag_cache' in sched.h, including page_frag_cache.h
in sched.h causes a compiler error due to the interdependence between
mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler
error by moving 'struct page_frag_cache' to mm_types_task.h, as
suggested by Alexander, see [3].

1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowe...@redhat.com/
2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956...@gmail.com/
3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=ym_aia4oomd-2wxyko42bizawmt_...@mail.gmail.com/
CC: David Howells 
CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Acked-by: Andrew Morton 
Reviewed-by: Alexander Duyck 
---
 include/linux/gfp.h                           |  22 ----
 include/linux/mm_types.h                      |  18 ---
 include/linux/mm_types_task.h                 |  18 +++
 include/linux/page_frag_cache.h               |  31 ++++
 include/linux/skbuff.h                        |   1 +
 mm/Makefile                                   |   1 +
 mm/page_alloc.c                               | 136 ------------
 mm/page_frag_cache.c                          | 145 ++++++++++++++
 .../selftests/mm/page_frag/page_frag_test.c   |   2 +-
 9 files changed, 197 insertions(+), 177 deletions(-)
 create mode 100644 include/linux/page_frag_cache.h
 create mode 100644 mm/page_frag_cache.c

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53f76e0b17e..01a49be7c98d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -371,28 +371,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
-struct page_frag_cache;
-void page_frag_cache_drain(struct page_frag_cache *nc);
-extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
- gfp_t gfp_mask, unsigned int align_mask);
-
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align)
-{
-   WARN_ON_ONCE(!is_power_of_2(align));
-   return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-unsigned int fragsz, gfp_t gfp_mask)
-{
-   return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
-}
-
-extern void page_frag_free(void *addr);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..843d75412105 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -521,9 +521,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
  */
 #define STRUCT_PAGE_MAX_SHIFT  (order_base_2(sizeof(struct page)))
 
-#define PAGE_FRAG_CACHE_MAX_SIZE   __ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER  get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 /*
  * page_private can be used on tail pages.  However, PagePrivate is only
  * checked by the VM on the head page.  So page_private on the tail pages
@@ -542,21 +539,6 @@ static inline void *folio_get_private(struct folio *folio)
return folio->private;
 }
 
-struct page_frag_cache {
-   void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   __u16 offset;
-   __u16 size;
-#else
-   __u32 offset;
-#endif
-   /* we maintain a pagecount bias, so that we dont dirty cache line
-* containing page->_refcount every time we allocate a fragment.
-*/
-   unsigned intpagecnt_bias;
-   bool pfmemalloc;
-};
-
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a2f6179b672b..cdc1e3696439 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -8,6 +8,7 @@
  * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
  */
 
+#include 
 #include 
 
 #include 
@@ -46,6 +47,23 @@ struct page_frag {
 #endif
 };
 
+#define PAGE_FRAG_CACHE_MAX_SIZE   __ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER  get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+struct page_frag_cache {
+   void *va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+   __u16 offset;
+   __u16 size;
+#else
+   __u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};

[PATCH net-next v18 01/14] mm: page_frag: add a test module for page_frag

2024-09-06 Thread Yunsheng Lin
The testing is done by ensuring that a fragment allocated
from a page_frag_cache instance is pushed into a ptr_ring
instance by a kthread bound to a specified cpu, and a kthread
bound to another specified cpu pops the fragment from the
ptr_ring and frees it.
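
(Illustrative usage once the module is built; the parameters are the
module's own, see the diff below:)

# insmod ./page_frag_test.ko test_align=1 test_alloc_len=2048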

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
---
 tools/testing/selftests/mm/Makefile   |   3 +
 tools/testing/selftests/mm/page_frag/Makefile |  18 ++
 .../selftests/mm/page_frag/page_frag_test.c   | 170 +
 tools/testing/selftests/mm/run_vmtests.sh |   8 +
 tools/testing/selftests/mm/test_page_frag.sh  | 171 ++
 5 files changed, 370 insertions(+)
 create mode 100644 tools/testing/selftests/mm/page_frag/Makefile
 create mode 100644 tools/testing/selftests/mm/page_frag/page_frag_test.c
 create mode 100755 tools/testing/selftests/mm/test_page_frag.sh

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index cfad627e8d94..e98ec779b2aa 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -36,6 +36,8 @@ MAKEFLAGS += --no-builtin-rules
 CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS = -lrt -lpthread -lm
 
+TEST_GEN_MODS_DIR := page_frag
+
 TEST_GEN_FILES = cow
 TEST_GEN_FILES += compaction_test
 TEST_GEN_FILES += gup_longterm
@@ -125,6 +127,7 @@ TEST_FILES += test_hmm.sh
 TEST_FILES += va_high_addr_switch.sh
 TEST_FILES += charge_reserved_hugetlb.sh
 TEST_FILES += hugetlb_reparenting_test.sh
+TEST_FILES += test_page_frag.sh
 
 # required by charge_reserved_hugetlb.sh
 TEST_FILES += write_hugetlb_memory.sh
diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile
new file mode 100644
index 000000000000..58dda74d50a3
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/Makefile
@@ -0,0 +1,18 @@
+PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
+KDIR ?= $(abspath $(PAGE_FRAG_TEST_DIR)/../../../../..)
+
+ifeq ($(V),1)
+Q =
+else
+Q = @
+endif
+
+MODULES = page_frag_test.ko
+
+obj-m += page_frag_test.o
+
+all:
+   +$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) modules
+
+clean:
+   +$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) clean
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
new file mode 100644
index 000000000000..6d6f31936b10
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright (C) 2024 Yunsheng Lin 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct ptr_ring ptr_ring;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_nc;
+
+static int nr_test = 200;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+   struct ptr_ring *ring = arg;
+   int nr = nr_test;
+
+   pr_info("page_frag pop test thread begins on cpu %d\n",
+   smp_processor_id());
+
+   while (nr > 0) {
+   void *obj = __ptr_ring_consume(ring);
+
+   if (obj) {
+   nr--;
+   page_frag_free(obj);
+   } else {
+   cond_resched();
+   }
+   }
+
+   if (atomic_dec_and_test(&nthreads))
+   complete(&wait);
+
+   pr_info("page_frag pop test thread exits on cpu %d\n",
+   smp_processor_id());
+
+   return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+   struct ptr_ring *ring = arg;
+   int nr = nr_test;
+
+   pr_info("page_frag push test thread begins on cpu %d\n",
+   smp_processor_id());
+
+   while (nr > 0) {
+   void *va;
+   int ret;
+
+   if (test_align) {
+   va = page_frag_alloc_align(&test_nc, test_alloc_len,
+  GFP_KERNEL, SMP_CACHE_BYTES);
+
+			WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
+				  "unaligned va returned\n");

[PATCH bpf-next v1 3/3] libbpf: fix some typos in libbpf

2024-09-05 Thread Lin Yikai
Hi, fix some spelling errors in libbpf. The details are as follows:

-in the code comments:
termintaing->terminating
architecutre->architecture
requring->requiring
recored->recoded
sanitise->sanities
allowd->allowed
abover->above
see bpf_udst_arg()->see bpf_usdt_arg()

Signed-off-by: Lin Yikai 
---
 tools/lib/bpf/bpf.h | 4 ++--
 tools/lib/bpf/bpf_tracing.h | 4 ++--
 tools/lib/bpf/btf.c | 2 +-
 tools/lib/bpf/libbpf.c  | 2 +-
 tools/lib/bpf/linker.c  | 4 ++--
 tools/lib/bpf/usdt.bpf.h| 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 972e17ec0c09..a4a7b1ad1b63 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -100,7 +100,7 @@ struct bpf_prog_load_opts {
__u32 log_level;
__u32 log_size;
char *log_buf;
-   /* output: actual total log contents size (including termintaing zero).
+   /* output: actual total log contents size (including terminating zero).
 * It could be both larger than original log_size (if log was
 * truncated), or smaller (if log buffer wasn't filled completely).
 * If kernel doesn't support this feature, log_size is left unchanged.
@@ -129,7 +129,7 @@ struct bpf_btf_load_opts {
char *log_buf;
__u32 log_level;
__u32 log_size;
-   /* output: actual total log contents size (including termintaing zero).
+   /* output: actual total log contents size (including terminating zero).
 * It could be both larger than original log_size (if log was
 * truncated), or smaller (if log buffer wasn't filled completely).
 * If kernel doesn't support this feature, log_size is left unchanged.
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 9314fa95f04e..9e45291e40ee 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -473,7 +473,7 @@ struct pt_regs;
 #endif
 /*
  * Similarly, syscall-specific conventions might differ between function call
- * conventions within each architecutre. All supported architectures pass
+ * conventions within each architecture. All supported architectures pass
  * either 6 or 7 syscall arguments in registers.
  *
  * See syscall(2) manpage for succinct table with information on each arch.
@@ -651,7 +651,7 @@ struct pt_regs;
  * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
  * similar kinds of BPF programs, that accept input arguments as a single
  * pointer to untyped u64 array, where each u64 can actually be a typed
- * pointer or integer of different size. Instead of requring user to write
+ * pointer or integer of different size. Instead of requiring user to write
  * manual casts and work with array elements by index, BPF_PROG macro
  * allows user to declare a list of named and typed input arguments in the
  * same syntax as for normal C function. All the casting is hidden and
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 40aae244e35f..8d51e73d55a8 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -4192,7 +4192,7 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2)
  * and canonical graphs are not compatible structurally, whole graphs are
  * incompatible. If types are structurally equivalent (i.e., all information
 * except referenced type IDs is exactly the same), a mapping from `canon_id` to
- * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`).
+ * a `cand_id` is recoded in hypothetical mapping (`btf_dedup->hypot_map`).
  * If a type references other types, then those referenced types are checked
  * for equivalence recursively.
  *
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d3a542649e6b..27ad3c6ee868 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1848,7 +1848,7 @@ static char *internal_map_name(struct bpf_object *obj, const char *real_name)
snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name,
 sfx_len, real_name);
 
-   /* sanitise map name to characters allowed by kernel */
+   /* sanities map name to characters allowed by kernel */
for (p = map_name; *p && p < map_name + sizeof(map_name); p++)
if (!isalnum(*p) && *p != '_' && *p != '.')
*p = '_';
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 9cd3d4109788..e0005c6ade88 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -1413,7 +1413,7 @@ static bool glob_sym_btf_matches(const char *sym_name, bool exact,
return true;
case BTF_KIND_PTR:
/* just validate overall shape of the referenced type, so no
-* contents comparison for struct/union,

[PATCH bpf-next v1 2/3] bpftool: fix some typos in bpftool

2024-09-05 Thread Lin Yikai
Hi, fix some spelling errors in bpftool. The details are as follows:

-in file "bpftool-gen.rst"
libppf->libbpf
-in the code comments:
ouptut->output

Signed-off-by: Lin Yikai 
---
 tools/bpf/bpftool/Documentation/bpftool-gen.rst |  2 +-
 tools/bpf/bpftool/feature.c | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
index c768e6d4ae09..1b426e58a7cd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
@@ -104,7 +104,7 @@ bpftool gen skeleton *FILE*
 
 - **example__load**.
   This function creates maps, loads and verifies BPF programs, initializes
-  global data maps. It corresponds to libppf's **bpf_object__load**\ ()
+  global data maps. It corresponds to libbpf's **bpf_object__load**\ ()
   API.
 
 - **example__open_and_load** combines **example__open** and
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index c754a428c8c6..4dbc4fcdf473 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -196,7 +196,7 @@ static void probe_unprivileged_disabled(void)
 {
long res;
 
-   /* No support for C-style ouptut */
+   /* No support for C-style output */
 
res = read_procfs("/proc/sys/kernel/unprivileged_bpf_disabled");
if (json_output) {
@@ -225,7 +225,7 @@ static void probe_jit_enable(void)
 {
long res;
 
-   /* No support for C-style ouptut */
+   /* No support for C-style output */
 
res = read_procfs("/proc/sys/net/core/bpf_jit_enable");
if (json_output) {
@@ -255,7 +255,7 @@ static void probe_jit_harden(void)
 {
long res;
 
-   /* No support for C-style ouptut */
+   /* No support for C-style output */
 
res = read_procfs("/proc/sys/net/core/bpf_jit_harden");
if (json_output) {
@@ -285,7 +285,7 @@ static void probe_jit_kallsyms(void)
 {
long res;
 
-   /* No support for C-style ouptut */
+   /* No support for C-style output */
 
res = read_procfs("/proc/sys/net/core/bpf_jit_kallsyms");
if (json_output) {
@@ -311,7 +311,7 @@ static void probe_jit_limit(void)
 {
long res;
 
-   /* No support for C-style ouptut */
+   /* No support for C-style output */
 
res = read_procfs("/proc/sys/net/core/bpf_jit_limit");
if (json_output) {
-- 
2.34.1




[PATCH bpf-next v1 1/3] selftests/bpf: fix some typos in selftests

2024-09-05 Thread Lin Yikai
Hi, fix some spelling errors in selftests. The details are as follows:

-in the codes:
test_bpf_sk_stoarge_map_iter_fd(void)
->test_bpf_sk_storage_map_iter_fd(void)
load BTF from btf_data.o->load BTF from btf_data.bpf.o

-in the code comments:
preample->preamble
multi-contollers->multi-controllers
errono->errno
unsighed/unsinged->unsigned
egree->egress
shoud->should
regsiter->register
assummed->assumed
conditiona->conditional
rougly->roughly
timetamp->timestamp
ingores->ignores
null-termainted->null-terminated
slepable->sleepable
implemenation->implementation
veriables->variables
timetamps->timestamps
substitue a costant->substitute a constant
secton->section
unreferened->unreferenced
        verifer->verifier
libppf->libbpf
...

Signed-off-by: Lin Yikai 
---
 tools/testing/selftests/bpf/benchs/bench_trigger.c |  2 +-
 tools/testing/selftests/bpf/cgroup_helpers.c   |  2 +-
 .../selftests/bpf/map_tests/htab_map_batch_ops.c   |  2 +-
 .../bpf/map_tests/lpm_trie_map_batch_ops.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  |  4 ++--
 tools/testing/selftests/bpf/prog_tests/btf.c   |  6 +++---
 .../selftests/bpf/prog_tests/cg_storage_multi.c|  2 +-
 tools/testing/selftests/bpf/prog_tests/log_buf.c   |  4 ++--
 .../testing/selftests/bpf/prog_tests/reg_bounds.c  | 14 +++---
 .../selftests/bpf/prog_tests/resolve_btfids.c  |  2 +-
 .../testing/selftests/bpf/prog_tests/tc_redirect.c |  2 +-
 .../selftests/bpf/prog_tests/test_bprm_opts.c  |  2 +-
 .../selftests/bpf/prog_tests/test_strncmp.c|  2 +-
 tools/testing/selftests/bpf/prog_tests/token.c |  4 ++--
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   |  2 +-
 .../selftests/bpf/prog_tests/user_ringbuf.c|  2 +-
 tools/testing/selftests/bpf/progs/bpf_cubic.c  |  6 +++---
 tools/testing/selftests/bpf/progs/strobemeta.h |  4 ++--
 .../selftests/bpf/progs/test_cls_redirect_dynptr.c |  2 +-
 .../selftests/bpf/progs/test_core_read_macros.c|  2 +-
 .../selftests/bpf/progs/test_global_func15.c   |  2 +-
 .../selftests/bpf/progs/test_global_map_resize.c   |  2 +-
 tools/testing/selftests/bpf/test_maps.c|  2 +-
 tools/testing/selftests/bpf/test_progs.c   |  2 +-
 tools/testing/selftests/bpf/verifier/map_kptr.c|  2 +-
 25 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index a220545a3238..2ed0ef6f21ee 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -276,7 +276,7 @@ static void trigger_rawtp_setup(void)
  * instructions. So use two different targets, one of which starts with nop
  * and another doesn't.
  *
- * GCC doesn't generate stack setup preample for these functions due to them
+ * GCC doesn't generate stack setup preamble for these functions due to them
  * having no input arguments and doing nothing in the body.
  */
 __nocf_check __weak void uprobe_target_nop(void)
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index 23bb9a9e6a7d..e4535451322e 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -644,7 +644,7 @@ unsigned long long get_classid_cgroup_id(void)
 /**
  * get_cgroup1_hierarchy_id - Retrieves the ID of a cgroup1 hierarchy from the cgroup1 subsys name.
  * @subsys_name: The cgroup1 subsys name, which can be retrieved from /proc/self/cgroup. It can be
- * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-contollers like
+ * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-controllers like
  * "net_cls,net_prio".
  */
 int get_cgroup1_hierarchy_id(const char *subsys_name)
diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
index 1230ccf90128..5da493b94ae2 100644
--- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
@@ -197,7 +197,7 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu)
CHECK(total != max_entries, "delete with steps",
  "total = %u, max_entries = %u\n", total, max_entries);
 
-   /* check map is empty, errono == ENOENT */
+   /* check map is empty, errno == ENOENT */
err = bpf_map_get_next_key(map_fd, NULL, &key);
CHECK(!err || errno != ENOENT, "bpf_ma

Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-08-06 Thread Yunsheng Lin
On 2024/8/6 8:52, Alexander Duyck wrote:
> On Sun, Aug 4, 2024 at 10:00 AM Yunsheng Lin  wrote:
>>
>> On 8/3/2024 1:00 AM, Alexander Duyck wrote:
>>
>>>>
>>>>>
>>>>> As far as your API extension and naming, maybe you should look at
>>>>> something like bio_vec and borrow the naming from that, since what
>>>>> you are passing back and forth is essentially that instead of a
>>>>> page frag, which is normally a virtual address.
>>>>
>>>> I thought about adding something like bio_vec before, but I am not sure
>>>> if what you have in mind is something like what I considered before?
>>>> Let's say that we reuse bio_vec like something below for the new APIs:
>>>>
>>>> struct bio_vec {
>>>>         struct page     *bv_page;
>>>>         void            *va;
>>>>         unsigned int    bv_len;
>>>>         unsigned int    bv_offset;
>>>> };
>>>
>>> I wasn't suggesting changing the bio_vec. I was suggesting that be
>>> what you pass as a pointer reference instead of the offset. Basically
>>> your use case is mostly just for populating bio_vec style structures
>>> anyway.
>>
>> I wasn't trying/going to reuse/change bio_vec for page_frag, I was just
>> having a hard time coming up with a good new name for it.
>> The best one I came up with is pfrag_vec, but I am not sure about the
>> 'vec' part, as the "vec" portion of the name would suggest that, like
>> iovec structures, they tend to come in arrays, as mentioned in the
>> article below:
>> https://lwn.net/Articles/625077/
>>
>> Another one is page_frag, which is currently in use.
>>
>> Or any better one in your mind?
> 
> I was suggesting using bio_vec, not some new structure. The general
> idea is that almost all the values you are using are exposed by that
> structure already in the case of the page based calls you were adding,
> so it makes sense to use what is there rather than reinventing the
> wheel.

Through a quick look, there seem to be at least three structs which have
similar fields: struct bio_vec, struct skb_frag and struct page_frag.

As per your above argument about using bio_vec, it seems it is ok to use
any one of them, as each one of them seems to have almost all the fields
we are using?

Personally, my preference among them is: 'struct page_frag' > 'struct
skb_frag' > 'struct bio_vec'. The naming of 'struct page_frag' seems to
best match the page_frag API; 'struct skb_frag' is the second preference
because we mostly need to fill skb frags anyway; and 'struct bio_vec' is
the last preference because it just happens to have almost all the fields
needed.

Is there any specific reason, other than "almost all the values you are
using are exposed by that structure already", that you prefer bio_vec?

> 
>>>
>>>> It seems we have the below options for the new API:
>>>>
>>>> option 1: it seems like a better option from an API naming point of
>>>> view, but it needs to return a bio_vec pointer to the caller, so it
>>>> seems we need extra space for the pointer; I am not sure how we can
>>>> avoid the memory waste for the sk_page_frag() case in patch 12:
>>>> struct bio_vec *page_frag_alloc_bio(struct page_frag_cache *nc,
>>>>                                     unsigned int fragsz, gfp_t gfp_mask);
>>>>
>>>> option 2: it needs both the caller and callee to have their own local
>>>> space for 'struct bio_vec'; I am not sure if passing the struct by
>>>> value through the function return is the common pattern and whether
>>>> it has any performance impact yet:
>>>> struct bio_vec page_frag_alloc_bio(struct page_frag_cache *nc,
>>>>                                    unsigned int fragsz, gfp_t gfp_mask);
>>>>
>>>> option 3: the caller passes the pointer of 'struct bio_vec' to the
>>>> callee, and page_frag_alloc_bio() fills in the data; I am not sure
>>>> what the point is of indirectly using 'struct bio_vec' instead of
>>>> passing 'va' & 'fragsz' & 'offset' through pointers directly:
>>>> bool page_frag_alloc_bio(struct page_frag_cache *nc,
>>>>                          unsigned int fragsz, gfp_t gfp_mask,
>>>>                          struct bio_vec *bio);

Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-08-03 Thread Yunsheng Lin

On 8/3/2024 1:00 AM, Alexander Duyck wrote:





>>> As far as your API extension and naming, maybe you should look at
>>> something like bio_vec and borrow the naming from that, since what
>>> you are passing back and forth is essentially that instead of a
>>> page frag, which is normally a virtual address.


>> I thought about adding something like bio_vec before, but I am not sure
>> if what you have in mind is something like what I considered before?
>> Let's say that we reuse bio_vec like something below for the new APIs:
>>
>> struct bio_vec {
>>         struct page     *bv_page;
>>         void            *va;
>>         unsigned int    bv_len;
>>         unsigned int    bv_offset;
>> };


> I wasn't suggesting changing the bio_vec. I was suggesting that be
> what you pass as a pointer reference instead of the offset. Basically
> your use case is mostly just for populating bio_vec style structures
> anyway.


I wasn't trying/going to reuse/change bio_vec for page_frag, I was just
having a hard time coming up with a good new name for it.
The best one I came up with is pfrag_vec, but I am not sure about the
'vec' part, as the "vec" portion of the name would suggest that, like
iovec structures, they tend to come in arrays, as mentioned in the
article below:

https://lwn.net/Articles/625077/

Another one is page_frag, which is currently in use.

Or any better one in your mind?




>> It seems we have the below options for the new API:
>>
>> option 1: it seems like a better option from an API naming point of view,
>> but it needs to return a bio_vec pointer to the caller, so it seems we
>> need extra space for the pointer; I am not sure how we can avoid the
>> memory waste for the sk_page_frag() case in patch 12:
>> struct bio_vec *page_frag_alloc_bio(struct page_frag_cache *nc,
>>                                     unsigned int fragsz, gfp_t gfp_mask);
>>
>> option 2: it needs both the caller and callee to have their own local
>> space for 'struct bio_vec'; I am not sure if passing the struct by value
>> through the function return is the common pattern and whether it has any
>> performance impact yet:
>> struct bio_vec page_frag_alloc_bio(struct page_frag_cache *nc,
>>                                    unsigned int fragsz, gfp_t gfp_mask);
>>
>> option 3: the caller passes the pointer of 'struct bio_vec' to the
>> callee, and page_frag_alloc_bio() fills in the data; I am not sure what
>> the point is of indirectly using 'struct bio_vec' instead of passing
>> 'va' & 'fragsz' & 'offset' through pointers directly:
>> bool page_frag_alloc_bio(struct page_frag_cache *nc,
>>                          unsigned int fragsz, gfp_t gfp_mask,
>>                          struct bio_vec *bio);
>>
>> If one of the above options is what you have in mind, please be more
>> specific about which one you prefer, and why it is preferable to the one
>> introduced in this patchset?
>>
>> If not, please be more specific about what you have in mind?


> Option 3 is more or less what I had in mind. Basically you would
> return an int to indicate any errors and you would be populating a
> bio_vec during your allocation. In addition you would use the bio_vec

Actually, using this new bio_vec style structure does not seem to solve
the API naming issue this patch is trying to solve, as I understand it,
since the new struct is only about passing one pointer versus multiple
pointers from an API naming perspective. It is part of the API naming,
but not all of it.


> as a tracker of the actual fragsz so when you commit you are
> committing with the fragsz as it was determined at the time of putting
> the bio_vec together so you can theoretically catch things like if the
> underlying offset had somehow changed from the time you setup the


I think we might need a stronger argument than the above debugging
feature to use the new *vec thing.

I looked through the bio_vec related info, and came across the somewhat
unrelated but really helpful "What's all this get us" section:
https://docs.kernel.org/block/biovecs.html

So the question seems to be: what does this new struct for page_frag
get us?

Generally, I would agree with the new struct if it brings us something
other than the above debugging feature. Otherwise we should avoid
introducing a new thing whose existence is hard to argue for.


> allocation. It would fit well into your probe routines since they are
> all essentially passing the page, offset, and fragsz throughout the
> code.


For the current probe routines, the 'va' needs to be passed; do you
expect the 'va' to be passed via the function return, a double pointer,
or the new *_vec pointer?









Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-08-02 Thread Yunsheng Lin
On 2024/8/1 23:21, Alexander Duyck wrote:
> On Thu, Aug 1, 2024 at 6:01 AM Yunsheng Lin  wrote:
>>
>> On 2024/8/1 2:13, Alexander Duyck wrote:
>>> On Wed, Jul 31, 2024 at 5:50 AM Yunsheng Lin  wrote:
>>>>
>>>> Currently the page_frag API is returning 'virtual address'
>>>> or 'va' when allocating and expecting 'virtual address' or
>>>> 'va' as input when freeing.
>>>>
>>>> As we are about to support new use cases where the caller
>>>> needs to deal with 'struct page' or with both 'va' and
>>>> 'struct page', differentiate the API handling between 'va'
>>>> and 'struct page' by adding a '_va' suffix to the
>>>> corresponding APIs, mirroring the page_pool_alloc_va()
>>>> API of the page_pool. Callers expecting to deal with va,
>>>> page, or both va and page may then call page_frag_alloc_va*,
>>>> page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.
>>>>
>>>> CC: Alexander Duyck 
>>>> Signed-off-by: Yunsheng Lin 
>>>> Reviewed-by: Subbaraya Sundeep 
>>>
>>> I am NAKing this patch. It is a pointless rename that is just going to
>>> obfuscate the git history for these callers.
>>
>> I responded to your similar comment above in v2, and then responded in
>> more detail in v11; neither got a direct response. It would be good to
>> have more concrete feedback here instead of abstract argument.
>>
>> https://lore.kernel.org/all/74e7259a-c462-e3c1-73ac-8e3f49fb8...@huawei.com/
>> https://lore.kernel.org/all/11187fe4-9419-4341-97b5-6dad7583b...@huawei.com/
> 
> I will make this much more understandable. This patch is one of the
> ones that will permanently block this set in my opinion. As such I
> will never ack this patch as I see no benefit to it. Arguing with me
> on this is moot as you aren't going to change my mind, and I don't
> have all day to argue back and forth with you on every single patch.

Let's move on to more specific technical discussion then.

> 
> As far as your API extension and naming, maybe you should look at
> something like bio_vec and borrow the naming from that, since what you
> are passing back and forth is essentially that instead of a page frag,
> which is normally a virtual address.

I thought about adding something like bio_vec before, but I am not sure
if what you have in mind is something like what I considered before?
Let's say that we reuse bio_vec like something below for the new APIs:

struct bio_vec {
        struct page     *bv_page;
        void            *va;
        unsigned int    bv_len;
        unsigned int    bv_offset;
};

It seems we have the below options for the new API:

option 1: it seems like a better option from an API naming point of view,
but it needs to return a bio_vec pointer to the caller, so it seems we
need extra space for the pointer; I am not sure how we can avoid the
memory waste for the sk_page_frag() case in patch 12:
struct bio_vec *page_frag_alloc_bio(struct page_frag_cache *nc,
                                    unsigned int fragsz, gfp_t gfp_mask);

option 2: it needs both the caller and callee to have their own local
space for 'struct bio_vec'; I am not sure if passing the struct by value
through the function return is the common pattern and whether it has any
performance impact yet:
struct bio_vec page_frag_alloc_bio(struct page_frag_cache *nc,
                                   unsigned int fragsz, gfp_t gfp_mask);

option 3: the caller passes the pointer of 'struct bio_vec' to the
callee, and page_frag_alloc_bio() fills in the data; I am not sure what
the point is of indirectly using 'struct bio_vec' instead of passing
'va' & 'fragsz' & 'offset' through pointers directly:
bool page_frag_alloc_bio(struct page_frag_cache *nc,
                         unsigned int fragsz, gfp_t gfp_mask,
                         struct bio_vec *bio);
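
(For illustration only: a sketch of how a hypothetical caller might use
option 3; none of these functions exist yet, and nc, fragsz and data are
illustrative.)

	struct bio_vec bv;

	if (!page_frag_alloc_bio(nc, fragsz, GFP_KERNEL, &bv))
		return -ENOMEM;

	/* the fragment is described by page + offset + len */
	memcpy(page_address(bv.bv_page) + bv.bv_offset, data, bv.bv_len);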

If one of the above options is what you have in mind, please be more
specific about which one you prefer, and why it is preferable to the
one introduced in this patchset?

If not, please be more specific about what you have in mind?




[PATCH 2/2] arm64: dts: qcom: msm8916-samsung-j3ltetw: Add initial device tree

2024-08-02 Thread Lin, Meng-Bo
The dts and dtsi add support for the msm8916 variant of the Samsung
Galaxy J3 SM-J320YZ smartphone released in 2016.

Add a device tree for SM-J320YZ with initial support for:

- GPIO keys
- SDHCI (internal and external storage)
- USB Device Mode
- UART (on USB connector via the SM5703 MUIC)
- WCNSS (WiFi/BT)
- Regulators
- QDSP6 audio
- Speaker/earpiece/headphones/microphones via digital/analog codec in
  MSM8916/PM8916
- WWAN Internet via BAM-DMUX
- Touchscreen

There are different variants of J3, with some differences in MUIC, sensor,
NFC and touch key I2C buses.

The common parts are shared in msm8916-samsung-j3-common.dtsi to reduce
duplication.

Signed-off-by: "Lin, Meng-Bo" 
---
 arch/arm64/boot/dts/qcom/Makefile |  1 +
 .../dts/qcom/msm8916-samsung-j3-common.dtsi   | 53 +++
 .../boot/dts/qcom/msm8916-samsung-j3ltetw.dts | 20 +++
 3 files changed, 74 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-j3-common.dtsi
 create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-j3ltetw.dts

diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
index e534442620a1..197ab325c0b9 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -48,6 +48,7 @@ dtb-$(CONFIG_ARCH_QCOM)   += msm8916-samsung-grandmax.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-grandprimelte.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-gt510.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-gt58.dtb
+dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-j3ltetw.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-j5.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-j5x.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8916-samsung-rossa.dtb
diff --git a/arch/arm64/boot/dts/qcom/msm8916-samsung-j3-common.dtsi b/arch/arm64/boot/dts/qcom/msm8916-samsung-j3-common.dtsi
new file mode 100644
index 000000000000..ce5c1ee450f1
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/msm8916-samsung-j3-common.dtsi
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "msm8916-samsung-j5-common.dtsi"
+
+/ {
+   reserved-memory {
+   /delete-node/ tz-apps@85500000;
+
+   /* Additional memory used by Samsung firmware modifications */
+   tz-apps@85800000 {
+   reg = <0x0 0x85800000 0x0 0x800000>;
+   no-map;
+   };
+   };
+
+   reg_vdd_tsp_a: regulator-vdd-tsp-a {
+   compatible = "regulator-fixed";
+   regulator-name = "vdd_tsp_a";
+   regulator-min-microvolt = <3000000>;
+   regulator-max-microvolt = <3000000>;
+
+   gpio = <&tlmm 16 GPIO_ACTIVE_HIGH>;
+   enable-active-high;
+
+   pinctrl-0 = <&tsp_ldo_en_default>;
+   pinctrl-names = "default";
+   };
+};
+
+&gpio_hall_sensor {
+   status = "disabled";
+};
+
+&i2c_muic {
+   /* GPIO pins vary depending on model variant */
+};
+
+&i2c_sensors {
+   /* GPIO pins vary depending on model variant */
+};
+
+&touchscreen {
+   vdd-supply = <&reg_vdd_tsp_a>;
+};
+
+&tlmm {
+   tsp_ldo_en_default: tsp-ldo-en-default-state {
+   pins = "gpio16";
+   function = "gpio";
+   drive-strength = <2>;
+   bias-disable;
+   };
+};
diff --git a/arch/arm64/boot/dts/qcom/msm8916-samsung-j3ltetw.dts 
b/arch/arm64/boot/dts/qcom/msm8916-samsung-j3ltetw.dts
new file mode 100644
index ..344e63588531
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/msm8916-samsung-j3ltetw.dts
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/dts-v1/;
+
+#include "msm8916-samsung-j3-common.dtsi"
+
+/ {
+   model = "Samsung Galaxy J3 (2016) (SM-J320YZ)";
+   compatible = "samsung,j3ltetw", "qcom,msm8916";
+   chassis-type = "handset";
+};
+
+&i2c_muic {
+   sda-gpios = <&tlmm 0 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
+   scl-gpios = <&tlmm 1 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
+};
+
+&muic_i2c_default {
+   pins = "gpio0", "gpio1";
+};
-- 
2.39.2





[PATCH 1/2] dt-bindings: qcom: Document samsung,j3ltetw

2024-08-02 Thread Lin, Meng-Bo
Document samsung,j3ltetw bindings used in its device tree.

Signed-off-by: "Lin, Meng-Bo" 
---
 Documentation/devicetree/bindings/arm/qcom.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml 
b/Documentation/devicetree/bindings/arm/qcom.yaml
index 4ef456cefd6c..c0529486810f 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -235,6 +235,7 @@ properties:
   - samsung,grandprimelte
   - samsung,gt510
   - samsung,gt58
+  - samsung,j3ltetw
   - samsung,j5
   - samsung,j5x
   - samsung,rossa
-- 
2.39.2





[PATCH 0/2] arm64: dts: qcom: msm8916-samsung-j3ltetw: Add initial device tree

2024-08-02 Thread Lin, Meng-Bo
The dts and dtsi add support for the msm8916 variant of the Samsung
Galaxy J3 SM-J320YZ smartphone, released in 2016.

Add a device tree for SM-J320YZ with initial support for:

- GPIO keys
- SDHCI (internal and external storage)
- USB Device Mode
- UART (on USB connector via the SM5703 MUIC)
- WCNSS (WiFi/BT)
- Regulators
- QDSP6 audio
- Speaker/earpiece/headphones/microphones via digital/analog codec in
  MSM8916/PM8916
- WWAN Internet via BAM-DMUX
- Touchscreen

There are different variants of J3, with some differences in MUIC, sensor,
NFC and touch key I2C buses.

The common parts are shared in msm8916-samsung-j3-common.dtsi to reduce
duplication.




Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-08-01 Thread Yunsheng Lin
On 2024/8/1 2:13, Alexander Duyck wrote:
> On Wed, Jul 31, 2024 at 5:50 AM Yunsheng Lin  wrote:
>>
>> Currently the page_frag API returns a 'virtual address' or
>> 'va' when allocating and expects a 'virtual address' or 'va'
>> as input when freeing.
>>
>> As we are about to support new use cases where the caller
>> needs to deal with 'struct page', or with both 'va' and
>> 'struct page', add a '_va' suffix to the corresponding APIs
>> to differentiate the handling of 'va' from 'struct page',
>> mirroring the page_pool_alloc_va() API of the page_pool.
>> Callers expecting to deal with va, page, or both va and page
>> may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
>> or page_frag_alloc* APIs accordingly.
>>
>> CC: Alexander Duyck 
>> Signed-off-by: Yunsheng Lin 
>> Reviewed-by: Subbaraya Sundeep 
> 
> I am naking this patch. It is a pointless rename that is just going to
> obfuscate the git history for these callers.

I responded to a similar comment of yours in v2, and then in more detail
in v11; neither got a direct response. It would be good to have more
concrete feedback here instead of an abstract argument.

https://lore.kernel.org/all/74e7259a-c462-e3c1-73ac-8e3f49fb8...@huawei.com/
https://lore.kernel.org/all/11187fe4-9419-4341-97b5-6dad7583b...@huawei.com/

> 
> As I believe I said before I would prefer to see this work more like
> the handling of __get_free_pages and __free_pages in terms of the use

I am not even sure why you are bringing up __get_free_pages() and
__free_pages() here, as their declarations are something like below:

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
void __free_pages(struct page *page, unsigned int order);

And here is another related one, added for completeness:
extern void free_pages(unsigned long addr, unsigned int order);

I fail to see any pattern or rule in the above API naming. If there is
some pattern to these existing APIs, please describe it in detail so
that we have a common understanding.

After the renaming, the declarations for both the new and old APIs are
below; please be more specific about what exactly is confusing about
them, and what better naming for the APIs below you have in mind:
struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
   unsigned int *offset, unsigned int fragsz,
   gfp_t gfp);
void *page_frag_alloc_va(struct page_frag_cache *nc,
 unsigned int fragsz, gfp_t gfp_mask);
struct page *page_frag_alloc(struct page_frag_cache *nc,
 unsigned int *offset,
 unsigned int fragsz,
 void **va, gfp_t gfp);
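
To make the intended split concrete, here is a minimal caller-side sketch
based only on the declarations above (error handling and freeing omitted):

static void page_frag_api_sketch(struct page_frag_cache *nc)
{
	unsigned int offset;
	struct page *page;
	void *va;

	/* caller only cares about the virtual address */
	va = page_frag_alloc_va(nc, 256, GFP_ATOMIC);

	/* caller only cares about the page and the offset into it */
	page = page_frag_alloc_pg(nc, &offset, 256, GFP_ATOMIC);

	/* caller cares about both */
	page = page_frag_alloc(nc, &offset, 256, &va, GFP_ATOMIC);
}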

> of pages versus pointers and/or longs. Pushing this API aside because
> you want to reuse the name for something different isn't a valid
> reason to rename an existing API and will just lead to confusion.

Before this patchset, all the page_frag APIs renamed with a '_va' suffix
in this patch were already dealing with virtual addresses; it would be
better to be more specific about what confusion is caused by making that
explicit with a 'va' suffix in this patch.

I would argue that the renaming avoids confusion about whether
page_frag_alloc() returns a 'struct page' or a virtual address, rather
than creating it.

Anyway, naming is always hard, and any better naming is welcome. But
please don't reject the renaming of an existing API outright when we have
not yet done a detailed comparison between the different naming options.



[PATCH net-next v12 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-07-31 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index ef038a07925c..7c9125a9aed3 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,16 @@
 #include 
 #include 
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 9eaa3ab74b29..6df8d8865afe 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -344,7 +344,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4b8acd967793..76a473b1072d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of

[PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-31 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or 'va'
as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding APIs
to differentiate the handling of 'va' from 'struct page',
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Subbaraya Sundeep 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 14 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
  

Re: [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-25 Thread Yunsheng Lin
On 2024/7/22 4:41, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin  wrote:
>>
>> Currently the page_frag API returns a 'virtual address' or
>> 'va' when allocating and expects a 'virtual address' or 'va'
>> as input when freeing.
>>
>> As we are about to support new use cases where the caller
>> needs to deal with 'struct page', or with both 'va' and
>> 'struct page', add a '_va' suffix to the corresponding APIs
>> to differentiate the handling of 'va' from 'struct page',
>> mirroring the page_pool_alloc_va() API of the page_pool.
>> Callers expecting to deal with va, page, or both va and page
>> may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
>> or page_frag_alloc* APIs accordingly.
>>
>> CC: Alexander Duyck 
>> Signed-off-by: Yunsheng Lin 
>> Reviewed-by: Subbaraya Sundeep 
> 
> Rather than renaming the existing API I would rather see this follow
> the same approach as we use with the other memory subsystem functions.

I am not sure I understand what 'the other memory subsystem functions'
refers to; it would be better to be more specific about that.

For the allocation side:
alloc_pages*()
extern unsigned long get_free_page*(gfp_t gfp_mask, unsigned int order);

For the free side, it seems we have:
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
static inline void put_page(struct page *page)

So there seems to be no clear pattern whereby the mm APIs with a double
underscore deal with 'struct page' and the ones without deal with
virtual addresses, at least not on the allocation side.

> A specific example being that with free_page it is essentially passed
> a virtual address, while the double underscore version is passed a
> page. I would be more okay with us renaming the double underscore
> version of any functions we might have to address that rather than
> renaming all the functions with "va".

Before this patchset, page_frag has the below APIs as below:

void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
  gfp_t gfp_mask, unsigned int align_mask);

static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
  unsigned int fragsz, gfp_t gfp_mask,
  unsigned int align)

extern void page_frag_free(void *addr);

It would be better to be more specific about what renaming the above
APIs would need in order to support the new use cases.

> 
> In general I would say this patch is adding no value as what it is

As above, it would be better to give a more specific suggestion to back
up the somewhat abstract argument above; otherwise it is hard to tell
whether there is a better option here, and why it would be better than
the one proposed in this patchset.

> doing is essentially pushing the primary users of this API to change
> to support use cases that won't impact most of them. It is just
> creating a ton of noise in terms of changes with no added value so we
> can reuse the function names.


After this patchset, we have the below page_frag APIs:

For the allocation side, we have the APIs below:
struct page *page_frag_alloc_pg*(struct page_frag_cache *nc,
unsigned int *offset, unsigned int fragsz,
gfp_t gfp);
void *page_frag_alloc_va*(struct page_frag_cache *nc,
 unsigned int fragsz, gfp_t gfp_mask,
 unsigned int align_mask);
struct page *page_frag_alloc*(struct page_frag_cache *nc,
 unsigned int *offset,
 unsigned int fragsz,
 void **va, gfp_t gfp);

For the free side, we have the API below:
void page_frag_free_va(void *addr);

The main rules for the above naming are (illustrated in the sketch
below):
1. The API with the 'align' suffix ensures the offset or va is aligned.
2. The API with a double underscore does no checking of the align
   parameter.
3. The API with the 'va' suffix deals with virtual addresses.
4. The API with the 'pg' suffix deals with 'struct page'.
5. The API without a 'pg' or 'va' suffix deals with both 'struct page'
   and virtual addresses.
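
A sketch of how these rules combine; the names mirror declarations quoted
elsewhere in this thread, so treat this as illustrative rather than final:

/* rule 3: returns a va */
void *page_frag_alloc_va(struct page_frag_cache *nc,
			 unsigned int fragsz, gfp_t gfp_mask);

/* rules 1 + 3: returns a va, checks and applies 'align' */
void *page_frag_alloc_va_align(struct page_frag_cache *nc,
			       unsigned int fragsz, gfp_t gfp_mask,
			       unsigned int align);

/* rules 1 + 2 + 3: returns a va, caller must pass a valid align_mask */
void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
				 unsigned int fragsz, gfp_t gfp_mask,
				 unsigned int align_mask);

/* rule 5: returns a page, plus a va through '**va' */
struct page *page_frag_alloc(struct page_frag_cache *nc,
			     unsigned int *offset, unsigned int fragsz,
			     void **va, gfp_t gfp);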

Yes, I suppose it is not perfect, mainly because we reuse some existing
mm API naming for the page_frag free API.

As mentioned before, I would be happy to change it if what you are proposing
is indeed the better option.



[RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-07-19 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 4c5079f232b5..ef1572f11248 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -8,6 +8,16 @@
 #include 
 #include 
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 9eaa3ab74b29..6df8d8865afe 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -344,7 +344,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4b8acd967793..76a473b1072d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-

[RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-19 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or 'va'
as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding APIs
to differentiate the handling of 'va' from 'struct page',
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Subbaraya Sundeep 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 14 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
  

[PATCH v2] arm64: dts: qcom: msm8916-samsung-grandmax: Add touchscreen

2024-07-18 Thread Lin, Meng-Bo
Grand Max uses an Imagis IST3038 touchscreen that is connected to
blsp_i2c5. Add it to the device tree.

Signed-off-by: "Lin, Meng-Bo" 
Reviewed-by: Konrad Dybcio 
---
v2: Fix comments for &reg_touch_key
---
 .../dts/qcom/msm8916-samsung-grandmax.dts | 24 ++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/qcom/msm8916-samsung-grandmax.dts 
b/arch/arm64/boot/dts/qcom/msm8916-samsung-grandmax.dts
index 135df1739dbd..5ddb69bf8e78 100644
--- a/arch/arm64/boot/dts/qcom/msm8916-samsung-grandmax.dts
+++ b/arch/arm64/boot/dts/qcom/msm8916-samsung-grandmax.dts
@@ -47,12 +47,34 @@ &battery {
	constant-charge-voltage-max-microvolt = <4400000>;
 };
 
+&blsp_i2c5 {
+   status = "okay";
+
+   touchscreen@50 {
+   compatible = "imagis,ist3038";
+   reg = <0x50>;
+
+   interrupts-extended = <&tlmm 13 IRQ_TYPE_EDGE_FALLING>;
+
+   touchscreen-size-x = <720>;
+   touchscreen-size-y = <1280>;
+
+   vdd-supply = <&reg_vdd_tsp_a>;
+   vddio-supply = <&pm8916_l6>;
+
+   pinctrl-0 = <&ts_int_default>;
+   pinctrl-names = "default";
+
+   linux,keycodes = <KEY_APPSELECT KEY_BACK>;
+   };
+};
+
&reg_motor_vdd {
gpio = <&tlmm 72 GPIO_ACTIVE_HIGH>;
 };
 
&reg_touch_key {
-   status = "disabled";
+   status = "disabled"; /* Using Imagis touch key */
 };
 
 &sound {
-- 
2.39.2





[PATCH net-next v10 05/15] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-07-09 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 185d875e3e6b..0ba96b7b64ad 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -26,6 +26,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 50166a059c7d..0d47235b5cf2 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -340,7 +340,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4b8acd967793..76a473b1072d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of

[PATCH net-next v10 04/15] mm: page_frag: add '_va' suffix to page_frag API

2024-07-09 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or 'va'
as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding APIs
to differentiate the handling of 'va' from 'struct page',
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 14 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_T

[PATCH net-next v9 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-06-25 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index a0bd0ca5f343..cdffebc20a10 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6a84cc929505..59d42d642067 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of

[PATCH net-next v9 04/13] mm: page_frag: add '_va' suffix to page_frag API

2024-06-25 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or 'va'
as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding APIs
to differentiate the handling of 'va' from 'struct page',
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 14 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_T

[PATCH net-next v8 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-06-17 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index c3cfce87fbbf..8b259e422fae 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6842fa6a71a5..ac5fe61c0ff2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
   

[PATCH net-next v8 04/13] mm: page_frag: add '_va' suffix to page_frag API

2024-06-17 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or 'va'
as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding APIs
to differentiate the handling of 'va' from 'struct page',
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
ca

[PATCH net-next v7 07/15] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-06-07 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index c3cfce87fbbf..8b259e422fae 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6842fa6a71a5..ac5fe61c0ff2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;

[PATCH net-next v7 06/15] mm: page_frag: add '_va' suffix to page_frag API

2024-06-07 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly.
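
Illustratively (a sketch, not taken from the patch itself), a
typical caller conversion keeps the alloc/free pairing symmetric:

	/* before: the va contract was implicit */
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (err)
		page_frag_free(frame);

	/* after: '_va' spells out that a virtual address is allocated/freed */
	frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (err)
		page_frag_free_va(frame);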

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;

[PATCH net-next v6 06/15] mm: page_frag: add '_va' suffix to page_frag API

2024-06-05 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;

[PATCH net-next v6 07/15] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-06-05 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' internals directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index c3cfce87fbbf..8b259e422fae 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;

[PATCH net-next v5 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-28 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' internals directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index bb9f5aa84631..641255136105 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -337,7 +337,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;

[PATCH net-next v5 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-28 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;

Re: [linus:master] [mm] d99e3140a4: BUG:KCSAN:data-race_in_folio_remove_rmap_ptes/print_report

2024-05-28 Thread Miaohe Lin
On 2024/5/28 15:43, David Hildenbrand wrote:
> On 28.05.24 at 09:11, kernel test robot wrote:
>>
>>
>> Hello,
>>
>> kernel test robot noticed 
>> "BUG:KCSAN:data-race_in_folio_remove_rmap_ptes/print_report" on:
>>
>> commit: d99e3140a4d33e26066183ff727d8f02f56bec64 ("mm: turn 
>> folio_test_hugetlb into a PageType")
>> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
>>
>> [test failed on linus/master  c760b3725e52403dc1b28644fb09c47a83cacea6]
>> [test failed on linux-next/master 3689b0ef08b70e4e03b82ebd37730a03a672853a]
>>
>> in testcase: trinity
>> version: trinity-i386-abe9de86-1_20230429
>> with following parameters:
>>
>> runtime: 300s
>> group: group-04
>> nr_groups: 5
>>
>>
>>
>> compiler: gcc-13
>> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>>
>> (please refer to attached dmesg/kmsg for entire log/backtrace)
>>
>>
>> we noticed this issue does not always happen. We also noticed there are
>> different random KCSAN issues for both this commit and its parent, but the
>> four below only happen on this commit at a non-trivial rate and stay clean on the parent.
>>
> 
> Likely that's just a page_type check racing against concurrent
> mapcount changes.
> 
> In __folio_rmap_sanity_checks() we check
> VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> 
> To make sure we don't get hugetlb folios in the wrong rmap code path. That
> can easily race with concurrent mapcount changes, just like any other
> page_type checks that end up in folio_test_type/page_has_type e.g., from
> PFN walkers.
> 
> Load tearing in these functions shouldn't really result in false positives
> (what we care about), but READ_ONCE shouldn't hurt or make a difference.
> 
> 
> From b03dc9bf27571442d886d8da624a4e4f737433f2 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand 
> Date: Tue, 28 May 2024 09:37:20 +0200
> Subject: [PATCH] mm: read page_type using READ_ONCE
> 
> KCSAN complains about possible data races: while we check for a
> page_type -- for example for sanity checks -- we might concurrently
> modify the mapcount that overlays page_type.
> 
> Let's use READ_ONCE to avoid load tearing (shouldn't make a difference)
> and to make KCSAN happy.
> 
> Note: nothing should really be broken besides wrong KCSAN complaints.
> 
> Reported-by: kernel test robot 
> Closes: https://lore.kernel.org/oe-lkp/202405281431.c46a3be9-...@intel.com
> Signed-off-by: David Hildenbrand 

LGTM. Thanks for fixing.

Reviewed-by: Miaohe Lin 
Thanks.
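
(A minimal sketch of the pattern discussed in this thread;
'page_has_type_sketch' is a made-up name for illustration, and the real
accessors live in include/linux/page-flags.h:)

	/* page_type overlays the mapcount field, so a type check may race
	 * with a concurrent mapcount update; READ_ONCE() avoids load
	 * tearing (and the KCSAN reports) without changing the result.
	 */
	static inline bool page_has_type_sketch(const struct page *page)
	{
		return (int)READ_ONCE(page->page_type) < PAGE_MAPCOUNT_RESERVE;
	}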




[RFC v4 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-15 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;

[RFC v4 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-15 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' internals directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index a5747cf7a3a1..024ff73a7ea4 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index bb9f5aa84631..641255136105 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -337,7 +337,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;

[PATCH net-next v3 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-08 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access 'page_frag_cache' internals directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index a5747cf7a3a1..024ff73a7ea4 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 92eb288aab75..8a974d0588bf 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -329,7 +329,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;

[PATCH net-next v3 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-08 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.
Callers expecting to deal with va, page, or both va and page
may then call the page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;

Re: [PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API

2024-04-17 Thread Yunsheng Lin
On 2024/4/17 0:12, Alexander H Duyck wrote:
> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>> Currently most of the page_frag API returns 'virtual address'
>> as output or expects 'virtual address' as input. In order to
>> differentiate the API handling between 'virtual address' and
>> 'struct page', add a '_va' suffix to the corresponding API,
>> mirroring the page_pool_alloc_va() API of the page_pool.
>>
>> Signed-off-by: Yunsheng Lin 
> 
> This patch is a total waste of time. By that logic we should be
> renaming __get_free_pages since it essentially does the same thing.
> 
> This just seems like more code changes for the sake of adding code
> changes rather than fixing anything. In my opinion it should be dropped
> from the set.

The rename is to support the different use cases mentioned below in
patch 14:
"Depending on different use cases, callers expecting to deal with va, page or
both va and page for them may call page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly."

Naming is hard anyway; I am open to better API naming for the above use cases.

> 
> .
> 



[PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API

2024-04-15 Thread Yunsheng Lin
Currently most of the page_frag API returns 'virtual address'
as output or expects 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 10 
 mm/page_frag_test.c   |  6 ++---
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 71 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index cd727e55ae0f..820874c1c570 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index df072ce767b1..c34cc02ad578 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -288,7 +288,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3161a13079fe..c35b8f675b48 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c

[PATCH net-next v1 04/12] mm: page_frag: add '_va' suffix to page_frag API

2024-04-07 Thread Yunsheng Lin
Currently most of the page_frag API returns 'virtual address'
as output or expects 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 10 
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  4 ++--
 18 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index cd727e55ae0f..820874c1c570 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index df072ce767b1..c34cc02ad578 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -288,7 +288,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3161a13079fe..c35b8f675b48 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 
 

[PATCH RFC 04/10] mm: page_frag: add '_va' suffix to page_frag API

2024-03-28 Thread Yunsheng Lin
Currently most of the page_frag API returns 'virtual address'
as output or expects 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_alloc.c  | 10 
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  4 ++--
 18 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index 20f5a9e7fae9..58091de93430 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 97d41d6ebf1f..87f23995b657 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index af955b0e5dc5..65ad1757824f 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index f8f1d2bdc1be..312f351ac601 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -279,7 +279,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 9c960017a6de..f781c5f202c9 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 
 

[PATCH net-next v6 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-02-28 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when tested with the vhost_net_test introduced in the
last patch.
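
The core of the conversion in vhost_net_build_xdp() is that the
open-coded refill, align and address computation collapse into a
single call; a minimal sketch of the two patterns (error paths
elided, details as in the diff below):

	/* before: refill 'struct page_frag', align the offset by hand,
	 * then compute the buffer address from page + offset
	 */
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen, alloc_frag,
						 GFP_KERNEL)))
		return -ENOMEM;
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;

	/* after: page_frag_alloc_align() refills, aligns and returns the
	 * virtual address in one step
	 */
	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
				    SMP_CACHE_BYTES);
	if (unlikely(!buf))
		return -ENOMEM;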

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_

[PATCH net-next v6 5/5] tools: virtio: introduce vhost_net_test

2024-02-28 Thread Yunsheng Lin
Introduce vhost_net_test for both vhost_net tx and rx, based
on virtio_test, to test vhost_net changes in the kernel.

Steps for vhost_net tx testing (sketched below):
1. Prepare an out buf.
2. Kick the vhost_net to do tx processing.
3. Do the receiving on the tun side.
4. Verify that the data received by tun is correct.

Steps for vhost_net rx testing:
1. Prepare an in buf.
2. Do the sending on the tun side.
3. Kick the vhost_net to do rx processing.
4. Verify that the data received by vhost_net is correct.
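
Mapping the tx steps onto the virtqueue API, the heart of the test
looks roughly as follows (batching, error handling and the spurious
wakeup accounting are elided; see run_tx_test() in the patch for the
full loop):

	struct scatterlist sl;
	unsigned int len;
	int n;

	/* 1. queue the prepared out buf */
	sg_init_one(&sl, dev->test_buf, HDR_LEN + TEST_BUF_LEN);
	virtqueue_add_outbuf(vq->vq, &sl, 1, dev->test_buf, GFP_ATOMIC);

	/* 2. kick the vhost_net to do tx processing */
	virtqueue_kick(vq->vq);

	/* 3. receive on the tun side via the packet socket */
	while (virtqueue_get_buf(vq->vq, &len)) {
		n = recvfrom(dev->sock, dev->res_buf, TEST_BUF_LEN, 0,
			     NULL, NULL);

		/* 4. verify the data received by tun */
		assert(n == TEST_BUF_LEN);
		verify_res_buf(dev->res_buf);
	}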

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore|   1 +
 tools/virtio/Makefile  |   8 +-
 tools/virtio/linux/virtio_config.h |   4 +
 tools/virtio/vhost_net_test.c  | 532 +
 4 files changed, 542 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/linux/virtio_config.h 
b/tools/virtio/linux/virtio_config.h
index 2a8a70e2a950..42a564f22f2d 100644
--- a/tools/virtio/linux/virtio_config.h
+++ b/tools/virtio/linux/virtio_config.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VIRTIO_CONFIG_H
+#define LINUX_VIRTIO_CONFIG_H
 #include 
 #include 
 #include 
@@ -95,3 +97,5 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device 
*vdev, u64 val)
 {
return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
+
+#endif
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..389d99a6d7c7
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HDR_LEN	sizeof(struct virtio_net_hdr_mrg_rxbuf)
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+#define DESC_NUM   256
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev, char *tun_name)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   strncpy(ifr.ifr_name, tun_name, IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perro

[PATCH net-next v6 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-28 Thread Yunsheng Lin
Currently there seem to be three page frag implementations,
all of which try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page. Each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to emergency reserve memory for __page_frag_cache_refill(),
but not in the other implementations; __GFP_DIRECT_RECLAIM is
masked off to avoid direct reclaim in
vhost_net_page_frag_refill(), but not in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Leave the gfp unification for the page frag implementation in
sock.c for now, as suggested by Paolo Abeni.
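
With this change both call sites compute the same mask for the
order 3 attempt; schematically (a sketch of the common pattern, not
the literal diff below):

	/* order 3 attempt: no direct reclaim, no emergency reserves */
	gfp_t gfp_order3 = (gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
			   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;

	page = alloc_pages(gfp_order3, SKB_FRAG_PAGE_ORDER);
	if (unlikely(!page))
		page = alloc_page(gfp);	/* order 0 fallback keeps the original gfp */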

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-- 
2.33.0




Re: [PATCH net-next v5 5/5] tools: virtio: introduce vhost_net_test

2024-02-05 Thread Yunsheng Lin
On 2024/2/6 11:08, Jason Wang wrote:

...

>> +
>> +static void wait_for_interrupt(struct vq_info *vq)
>> +{
>> +   unsigned long long val;
>> +
>> +   poll(&vq->fds, 1, -1);
> 
> It's not good to wait indefinitely.

How about a timeout value of 100ms as below?
poll(&vq->fds, 1, 100);
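
A minimal sketch of wait_for_interrupt() with such a bounded wait
(assuming 100ms is an acceptable timeout for the test) would be:

	static void wait_for_interrupt(struct vq_info *vq)
	{
		unsigned long long val;

		/* bounded wait instead of blocking indefinitely */
		if (poll(&vq->fds, 1, 100) <= 0)
			return;

		if (vq->fds.revents & POLLIN)
			read(vq->fds.fd, &val, sizeof(val));
	}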

> 
>> +
>> +   if (vq->fds.revents & POLLIN)
>> +   read(vq->fds.fd, &val, sizeof(val));
>> +}
>> +
>> +static void verify_res_buf(char *res_buf)
>> +{
>> +   int i;
>> +
>> +   for (i = ETHER_HDR_LEN; i < TEST_BUF_LEN; i++)
>> +   assert(res_buf[i] == (char)i);
>> +}
>> +
>> +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int bufs)
>> +{
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   virtqueue_disable_cb(vq->vq);
>> +   do {
>> +   while (vq->started < bufs &&
>> +  (vq->started - vq->completed) < 1) {
>> +   sg_init_one(&sl, dev->test_buf, HDR_LEN + 
>> TEST_BUF_LEN);
>> +   r = virtqueue_add_outbuf(vq->vq, &sl, 1,
>> +dev->test_buf + 
>> vq->started,
>> +GFP_ATOMIC);
>> +   if (unlikely(r != 0))
>> +   break;
>> +
>> +   ++vq->started;
> 
> If we never decrease started/completed shouldn't we use unsigned here?
> (as well as completed)
> 
> Otherwise we may get unexpected results for vq->started as well as
> vq->completed.

We have "vq->started < bufs" checking before the increasing as above,
and there is 'assert(nbufs > 0)' when getting optarg in main(), which
means we never allow started/completed to be greater than nbufs as
my understanding.

> 
>> +
>> +   if (unlikely(!virtqueue_kick(vq->vq))) {
>> +   r = -1;
>> +   break;
>> +   }
>> +   }
>> +
>> +   if (vq->started >= bufs)
>> +   r = -1;
> 
> Which condition do we reach here?

It is also copied from virtio_test.c.
It means we have finished adding the outbufs to the virtqueue, and 'r'
is set to '-1' so that we can break out of the inner while loop when
virtqueue_get_buf() returns no result, as I understand it.

> 
>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, &len)) {
>> +   int n;
>> +
>> +   n = recvfrom(dev->sock, dev->res_buf, 
>> TEST_BUF_LEN, 0, NULL, NULL);
>> +   assert(n == TEST_BUF_LEN);
>> +   verify_res_buf(dev->res_buf);
>> +
>> +   ++vq->completed;
>> +   r = 0;
>> +   }
>> +   } while (r == 0);
>> +
>> +   if (vq->completed == completed_before && vq->started == 
>> started_before)
>> +   ++spurious;
>> +
>> +   assert(vq->completed <= bufs);
>> +   assert(vq->started <= bufs);
>> +   if (vq->completed == bufs)
>> +   break;
>> +
>> +   if (delayed) {
>> +   if (virtqueue_enable_cb_delayed(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   } else {
>> +   if (virtqueue_enable_cb(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   }
> 
> This could be simplified with
> 
> if (delayed)
> else
> 
> wait_for_interrupt(vq)

I am not sure I understand the above comment.
wait_for_interrupt() is only called conditionally, depending on the
return values of virtqueue_enable_cb_delayed() and virtqueue_enable_cb().

> 
>> +   }
>> +   printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n",
>> +  spurious, vq->started, vq->completed);
>> +}
>> +

...

>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, &len)) {
>> +   struct ether_header *eh;
>> +
>> +   eh = (struct ether_header *)(dev->res_buf + 
>> HDR_LEN);
>> +
>> +   /* tun netdev is up and running, ignore the
>> +* non-TEST_PTYPE packet.
>> +*/
>> +   if (eh->ether_type != htons(TEST_PTYPE)) {
>> +  

[PATCH net-next v5 5/5] tools: virtio: introduce vhost_net_test

2024-02-05 Thread Yunsheng Lin
Introduce vhost_net_test for both vhost_net tx and rx, based
on virtio_test, to test vhost_net changes in the kernel.

Steps for vhost_net tx testing:
1. Prepare an out buf.
2. Kick the vhost_net to do tx processing.
3. Do the receiving on the tun side.
4. Verify that the data received by tun is correct.

Steps for vhost_net rx testing:
1. Prepare an in buf.
2. Do the sending on the tun side.
3. Kick the vhost_net to do rx processing.
4. Verify that the data received by vhost_net is correct.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore|   1 +
 tools/virtio/Makefile  |   8 +-
 tools/virtio/linux/virtio_config.h |   4 +
 tools/virtio/vhost_net_test.c  | 536 +
 4 files changed, 546 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/linux/virtio_config.h 
b/tools/virtio/linux/virtio_config.h
index 2a8a70e2a950..42a564f22f2d 100644
--- a/tools/virtio/linux/virtio_config.h
+++ b/tools/virtio/linux/virtio_config.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VIRTIO_CONFIG_H
+#define LINUX_VIRTIO_CONFIG_H
 #include 
 #include 
 #include 
@@ -95,3 +97,5 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device 
*vdev, u64 val)
 {
return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
+
+#endif
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..6c41204e6707
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HDR_LEN	sizeof(struct virtio_net_hdr_mrg_rxbuf)
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+#define DESC_NUM   256
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev, char *tun_name)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   strncpy(ifr.ifr_name, tun_name, IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perro

[PATCH net-next v5 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-02-05 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when tested with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_

[PATCH net-next v5 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-05 Thread Yunsheng Lin
Currently there seem to be three page frag implementations,
all of which try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page. Each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to emergency reserve memory for __page_frag_cache_refill(),
but not in the other implementations; __GFP_DIRECT_RECLAIM is
masked off to avoid direct reclaim in
vhost_net_page_frag_refill(), but not in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Leave the gfp unification for the page frag implementation in
sock.c for now, as suggested by Paolo Abeni.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-- 
2.33.0




Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-02-03 Thread Yunsheng Lin
On 2024/2/4 9:30, Jason Wang wrote:
> On Fri, Feb 2, 2024 at 8:24 PM Yunsheng Lin  wrote:
>>
>> On 2024/2/2 12:05, Jason Wang wrote:
>>> On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin  wrote:
>>>>
>>>> introduce vhost_net_test, based on virtio_test, to test
>>>> vhost_net changes in the kernel.
>>>
>>> Let's describe what kind of test is being done and how it is done here.
>>
>> How about something like below:
>>
>> This patch introduces testing for both vhost_net tx and rx.
>> Steps for vhost_net tx testing:
>> 1. Prepare an out buf
>> 2. Kick the vhost_net to do tx processing
>> 3. Do the receiving on the tun side
>> 4. Verify that the data received by tun is correct
>>
>> Steps for vhost_net rx testing:
>> 1. Prepare an in buf
>> 2. Do the sending on the tun side
>> 3. Kick the vhost_net to do rx processing
>> 4. Verify that the data received by vhost_net is correct
> 
> It looks like some important details were lost, e.g the logic for batching 
> etc.

I am supposing you are referring to the virtio desc batch handling,
right?

It was copy & paste code from virtio_test.c. I was thinking about
removing the virtio desc batch handling for now, as this patchset does
not require it for the testing; it mainly depends on
"sock->sk->sk_sndbuf" being INT_MAX so that vhost_net_build_xdp() is
called, which seems to be the default case for vhost_net.

> 
>>

...

>>>> +static void vdev_create_socket(struct vdev_info *dev)
>>>> +{
>>>> +   struct ifreq ifr;
>>>> +
>>>> +   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
>>>> +   assert(dev->sock != -1);
>>>> +
>>>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
>>>
>>> Nit: it might be better to accept the device name instead of repeating
>>> the snprintf trick here, this would facilitate the future changes.
>>
>> I am not sure I understand what you meant by "accept the device name"
>> here.
>>
>> The above is used to get ifindex of the tun netdevice created in
>> tun_alloc(), so that we can use it in vdev_send_packet() to send
>> a packet using the tun netdevice created in tun_alloc(). Is there
>> anything obvious I missed here?
> 
> I meant a const char *ifname for this function and let the caller to
> pass the name.

Sure.

> 
>>

>>>> +
>>>> +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
>>>> +   bool delayed, int batch, int bufs)
>>>> +{
>>>> +   const bool random_batch = batch == RANDOM_BATCH;
>>>> +   long long spurious = 0;
>>>> +   struct scatterlist sl;
>>>> +   unsigned int len;
>>>> +   int r;
>>>> +
>>>> +   for (;;) {
>>>> +   long started_before = vq->started;
>>>> +   long completed_before = vq->completed;
>>>> +
>>>> +   do {
>>>> +   if (random_batch)
>>>> +   batch = (random() % vq->vring.num) + 1;
>>>> +
>>>> +   while (vq->started < bufs &&
>>>> +  (vq->started - vq->completed) < batch) {
>>>> +   sg_init_one(&sl, dev->res_buf, HDR_LEN + 
>>>> TEST_BUF_LEN);
>>>> +
>>>> +   r = virtqueue_add_inbuf(vq->vq, &sl, 1,
>>>> +   dev->res_buf + 
>>>> vq->started,
>>>> +   GFP_ATOMIC);
>>>> +   if (unlikely(r != 0)) {
>>>> +   if (r == -ENOSPC &&
>>>
>>> Drivers usually maintain a #free_slots, this can help to avoid the
>>> trick for checking ENOSPC?
>>
>> The above "(vq->started - vq->completed) < batch" seems to ensure that
>> the 'r' can't be '-ENOSPC'?
> 
> Well, if this is true, is there any reason we still check ENOSPC here?

As mentioned above, it was copy & paste code from virtio_test.c.
Will remove the 'r == -ENOSPC' check.
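
With the batch bounded by the ring size up front, the inbuf add loop
then needs no -ENOSPC special case at all; a sketch of the simplified
loop (locals as in run_rx_test() quoted above):

	assert(batch <= vq->vring.num);	/* guarantees no -ENOSPC below */

	while (vq->started < bufs &&
	       (vq->started - vq->completed) < batch) {
		sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN);
		r = virtqueue_add_inbuf(vq->vq, &sl, 1,
					dev->res_buf + vq->started,
					GFP_ATOMIC);
		assert(r == 0);
		++vq->started;
	}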

> 
>> We just need to ensure the batch <= desc_num,
>> and the 'r == -ENOSPC' checking seems to be unnecessary.
>>

Re: [PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-02 Thread Yunsheng Lin
On 2024/2/2 16:36, Paolo Abeni wrote:
> On Fri, 2024-02-02 at 10:10 +0800, Yunsheng Lin wrote:
>> On 2024/2/1 21:16, Paolo Abeni wrote:
>>
>>> from the __page_frag_cache_refill() allocator - which never accesses
>>> the memory reserves.
>>
>> I am not really sure I understand the above comment.
>> The semantics are the same as in skb_page_frag_refill(), as explained
>> above, as I understand it. Note that __page_frag_cache_refill() uses
>> 'gfp_mask' for allocating order 3 pages and the original 'gfp' for
>> allocating order 0 pages.
> 
> You are right! I got fooled misreading 'gfp' as 'gfp_mask' in there.
> 
>>> I'm unsure we want to propagate the __page_frag_cache_refill behavior
>>> here, the current behavior could be required by some systems.
>>>
>>> It looks like this series still leaves the skb_page_frag_refill()
>>> allocator alone; what about dropping this chunk, too?
>>
>> As explained above, I would prefer to keep it as it is, since it
>> seems quite obvious that we can avoid possible pressure on mm by
>> not using the memory reserve for order 3 pages, given that we have
>> the fallback to order 0 pages.
>>
>> Please let me know if there is anything obvious I missed.
>>
> 
> I still think/fear that behaviour changes here could have
> subtle/negative side effects - even if I agree the change looks safe.
> 
> I think the series without this patch would still achieve its goals and
> would be much less controversial. What about moving this patch to a
> standalone follow-up?

Fair enough, will remove that for now.

> 
> Thanks!
> 
> Paolo
> 
> .
> 



Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-02-02 Thread Yunsheng Lin
On 2024/2/2 12:05, Jason Wang wrote:
> On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin  wrote:
>>
>> introduce vhost_net_test, based on virtio_test, to test
>> vhost_net changes in the kernel.
> 
> Let's describe what kind of test is being done and how it is done here.

How about something like below:

This patch introduces testing for both vhost_net tx and rx.
Steps for vhost_net tx testing:
1. Prepare an out buf
2. Kick the vhost_net to do tx processing
3. Do the receiving on the tun side
4. Verify that the data received by tun is correct

Steps for vhost_net rx testing:
1. Prepare an in buf
2. Do the sending on the tun side
3. Kick the vhost_net to do rx processing
4. Verify that the data received by vhost_net is correct


>> +
>> +static int tun_alloc(struct vdev_info *dev)
>> +{
>> +   struct ifreq ifr;
>> +   int len = HDR_LEN;
> 
> Any reason you can't just use the virtio_net uapi?

I didn't find a macro for that in include/uapi/linux/virtio_net.h.

Did you mean using something like below?
sizeof(struct virtio_net_hdr_mrg_rxbuf)

> 
>> +   int fd, e;
>> +
>> +   fd = open("/dev/net/tun", O_RDWR);
>> +   if (fd < 0) {
>> +   perror("Cannot open /dev/net/tun");
>> +   return fd;
>> +   }
>> +
>> +   memset(&ifr, 0, sizeof(ifr));
>> +
>> +   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
>> +
>> +   e = ioctl(fd, TUNSETIFF, &ifr);
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETIFF]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETVNETHDRSZ]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
>> +   if (e < 0) {
>> +   perror("ioctl[SIOCGIFHWADDR]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
>> +   return fd;
>> +}
>> +
>> +static void vdev_create_socket(struct vdev_info *dev)
>> +{
>> +   struct ifreq ifr;
>> +
>> +   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
>> +   assert(dev->sock != -1);
>> +
>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
> 
> Nit: it might be better to accept the device name instead of repeating
> the snprintf trick here, this would facilitate the future changes.

I am not sure I understand what you meant by "accept the device name"
here.

The above is used to get ifindex of the tun netdevice created in
tun_alloc(), so that we can use it in vdev_send_packet() to send
a packet using the tun netdevice created in tun_alloc(). Is there
anything obvious I missed here?

> 
>> +   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
>> +
>> +   dev->ifindex = ifr.ifr_ifindex;
>> +
>> +   /* Set the flags that bring the device up */
>> +   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
>> +   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
>> +   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
>> +}
>> +
>> +static void vdev_send_packet(struct vdev_info *dev)
>> +{
>> +   char *sendbuf = dev->test_buf + HDR_LEN;
>> +   struct sockaddr_ll saddrll = {0};
>> +   int sockfd = dev->sock;
>> +   int ret;
>> +
>> +   saddrll.sll_family = PF_PACKET;
>> +   saddrll.sll_ifindex = dev->ifindex;
>> +   saddrll.sll_halen = ETH_ALEN;
>> +   saddrll.sll_protocol = htons(TEST_PTYPE);
>> +
>> +   ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
>> +(struct sockaddr *)&saddrll,
>> +sizeof(struct sockaddr_ll));
>> +   assert(ret >= 0);
>> +}
>> +

...

>> +
>> +static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd)
>> +{
>> +   struct vhost_vring_file backend = { .index = idx, .fd = fd };
>> +   struct vq_info *info = &dev->vqs[idx];
>> +   int r;
>> +
>> +   info->idx = idx;
>> +   info->kick = eventfd(0, EFD_NONBLOCK);
>> +   info->call = eventfd(0, EFD_NON

Re: [PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-01 Thread Yunsheng Lin
On 2024/2/1 21:16, Paolo Abeni wrote:
> On Tue, 2024-01-30 at 19:37 +0800, Yunsheng Lin wrote:
>> Currently there seem to be three page frag implementations,
>> all of which try to allocate an order 3 page; if that fails,
>> they fall back to allocating an order 0 page. Each of them
>> allows the order 3 page allocation to fail under certain
>> conditions by using specific gfp bits.
>>
>> The gfp bits for order 3 page allocation differ between the
>> implementations: __GFP_NOMEMALLOC is or'd in to forbid access
>> to emergency reserve memory for __page_frag_cache_refill(),
>> but not in the other implementations; __GFP_DIRECT_RECLAIM is
>> masked off to avoid direct reclaim in skb_page_frag_refill(),
>> but not in __page_frag_cache_refill().
>>
>> This patch unifies the gfp bits used between the different
>> implementations by or'ing in __GFP_NOMEMALLOC and masking off
>> __GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
>> possible pressure on mm.
>>
>> Signed-off-by: Yunsheng Lin 
>> Reviewed-by: Alexander Duyck 
>> CC: Alexander Duyck 
>> ---
>>  drivers/vhost/net.c | 2 +-
>>  mm/page_alloc.c | 4 ++--
>>  net/core/sock.c | 2 +-
>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index f2ed7167c848..e574e21cc0ca 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
>> *net, unsigned int sz,
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
> 
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index c0f7e67c4250..636145c29f70 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
>> page_frag_cache *nc,
>>  gfp_t gfp = gfp_mask;
>>  
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
>> -__GFP_NOMEMALLOC;
>> +gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>> +   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>  page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>  PAGE_FRAG_CACHE_MAX_ORDER);
>>  nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 88bf810394a5..8289a3d8c375 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2919,7 +2919,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
>> page_frag *pfrag, gfp_t gfp)
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
> 
> This will prevent memory reserve usage when allocating order 3 pages,
> but not when allocating a single page as a fallback. Still different

More accurately, the above ensures the memory reserve is never used
for order 3 pages; whether the memory reserve is used for order 0
pages depends on the original 'gfp' flags: if 'gfp' does not have the
__GFP_NOMEMALLOC bit set, the memory reserve may still be used for
order 0 pages.

> from the __page_frag_cache_refill() allocator - which never accesses
> the memory reserves.

I am not really sure I understand the above comment.
The semantics are the same as in skb_page_frag_refill(), as explained
above, as I understand it. Note that __page_frag_cache_refill() uses
'gfp_mask' for allocating order 3 pages and the original 'gfp' for
allocating order 0 pages.
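
For reference, the relevant pattern in __page_frag_cache_refill() is
sketched below (abridged; the order 0 fallback line is taken from the
upstream function, not from the hunk quoted above):

	gfp_t gfp = gfp_mask;	/* saved: used for the order 0 fallback */

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	/* order 3 attempt uses the hardened 'gfp_mask' */
	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
				PAGE_FRAG_CACHE_MAX_ORDER);
#endif
	if (unlikely(!page))
		/* order 0 fallback uses the caller's original 'gfp' */
		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);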

> 
> I'm unsure we want to propagate the __page_frag_cache_refill behavior
> here, the current behavior could be required by some systems.
> 
> It looks like this series still leaves the skb_page_frag_refill()
> allocator alone; what about dropping this chunk, too?

As explained above, I would prefer to keep it as it is, since it
seems quite obvious that we can avoid possible pressure on mm by
not using the memory reserve for order 3 pages, given that we have
the fallback to order 0 pages.

Please let me know if there is anything obvious I missed.

> 
> Thanks!
> 
> Paolo
> 
> 
> .
> 



[PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-01-30 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore   |   1 +
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 576 ++
 3 files changed, 582 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..e336792a0d77
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN	12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int so

[PATCH net-next v4 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-01-30 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when tested with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_

[PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-01-30 Thread Yunsheng Lin
Currently there seem to be three page frag implementations,
all of which try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page. Each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to emergency reserve memory for __page_frag_cache_refill(),
but not in the other implementations; __GFP_DIRECT_RECLAIM is
masked off to avoid direct reclaim in skb_page_frag_refill(),
but not in __page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 88bf810394a5..8289a3d8c375 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2919,7 +2919,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




[PATCH net-next v3 5/5] tools: virtio: introduce vhost_net_test

2024-01-23 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore   |   1 +
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 576 ++
 3 files changed, 582 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..e336792a0d77
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN	12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int so

[PATCH net-next v3 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-01-23 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.
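
Condensed, the change to the TX buffer setup in vhost_net_build_xdp()
amounts to roughly the following (an illustrative sketch, not the
exact hunks below):

        /* Before: align the offset, refill the private page frag,
         * then compute the buffer pointer by hand.
         */
        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
        if (unlikely(!vhost_net_page_frag_refill(net, buflen, alloc_frag,
                                                 GFP_KERNEL)))
                return -ENOMEM;
        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;

        /* After: a single call that aligns, refills on demand and
         * returns the buffer directly from the page_frag_cache.
         */
        buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
                                    SMP_CACHE_BYTES);
        if (unlikely(!buf))
                return -ENOMEM;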

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_

[PATCH net-next v3 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-01-23 Thread Yunsheng Lin
Currently there are three page frag implementations, all of
which try to allocate an order-3 page and, if that fails,
fall back to allocating an order-0 page; each of them allows
the order-3 page allocation to fail under certain conditions
by using specific gfp bits.

The gfp bits for order-3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, while
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
skb_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order-3 page allocation, to avoid
possible pressure on mm.
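
In other words, the order-3 attempt in all implementations now uses
gfp bits along the lines of the sketch below (illustrative only; the
exact hunks follow):

        /* Avoid direct reclaim and the emergency reserves, but still
         * allow kswapd to wake; never warn or retry for this attempt.
         */
        gfp_t high_order_gfp = (gfp & ~__GFP_DIRECT_RECLAIM) |
                               __GFP_COMP | __GFP_NOWARN |
                               __GFP_NORETRY | __GFP_NOMEMALLOC;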

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..d4bc4269d7d7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2908,7 +2908,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2024-01-08 Thread Yunsheng Lin
On 2024/1/6 0:06, Alexander H Duyck wrote:
>>  
>>  static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>> @@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct 
>> file *f)
>>  vqs[VHOST_NET_VQ_RX]);
>>  
>>  f->private_data = n;
>> -n->page_frag.page = NULL;
>> -n->refcnt_bias = 0;
>> +n->pf_cache.va = NULL;
>>  
>>  return 0;
>>  }
>> @@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, 
>> struct file *f)
>>  kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
>>  kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
>>  kfree(n->dev.vqs);
>> -if (n->page_frag.page)
>> -__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
>> +if (n->pf_cache.va)
>> +__page_frag_cache_drain(virt_to_head_page(n->pf_cache.va),
>> +n->pf_cache.pagecnt_bias);
>>  kvfree(n);
>>  return 0;
>>  }
> 
> I would recommend reordering this patch with patch 5. Then you could
> remove the block that is setting "n->pf_cache.va = NULL" above and just
> make use of page_frag_cache_drain in the lower block which would also
> return the va to NULL.

I am not sure we can, as there is no zeroing of 'struct vhost_net' in
vhost_net_open().

If we don't have "n->pf_cache.va = NULL", don't we use uninitialized data
when calling page_frag_alloc_align() for the first time?
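
A reduced sketch of vhost_net_open() to illustrate (error handling
omitted):

        n = kvmalloc(sizeof(*n), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        ...
        /* 'n' is not zeroed by kvmalloc(), so without this line the
         * first page_frag_alloc_align() would test garbage in
         * pf_cache.va.
         */
        n->pf_cache.va = NULL;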

> .
> 



Re: [PATCH net-next 2/6] page_frag: unify gfp bits for order 3 page allocation

2024-01-08 Thread Yunsheng Lin
On 2024/1/5 23:35, Alexander H Duyck wrote:
> On Wed, 2024-01-03 at 17:56 +0800, Yunsheng Lin wrote:
>> Currently there are three page frag implementations, all of
>> which try to allocate an order-3 page and, if that fails,
>> fall back to allocating an order-0 page; each of them allows
>> the order-3 page allocation to fail under certain conditions
>> by using specific gfp bits.
>>
>> The gfp bits for order-3 page allocation differ between the
>> implementations: __GFP_NOMEMALLOC is or'd in to forbid access
>> to the emergency memory reserves in __page_frag_cache_refill(),
>> but it is not or'd in the other implementations, while
>> __GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
>> skb_page_frag_refill(), but it is not masked off in
>> __page_frag_cache_refill().
>>
>> This patch unifies the gfp bits used by the different
>> implementations by or'ing in __GFP_NOMEMALLOC and masking off
>> __GFP_DIRECT_RECLAIM for order-3 page allocation, to avoid
>> possible pressure on mm.
>>
>> Signed-off-by: Yunsheng Lin 
>> CC: Alexander Duyck 
>> ---
>>  drivers/vhost/net.c | 2 +-
>>  mm/page_alloc.c | 4 ++--
>>  net/core/sock.c | 2 +-
>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index f2ed7167c848..e574e21cc0ca 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
>> *net, unsigned int sz,
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 9a16305cf985..1f0b36dd81b5 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
>> page_frag_cache *nc,
>>  gfp_t gfp = gfp_mask;
>>  
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
>> -__GFP_NOMEMALLOC;
>> +gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>> +   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>  page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>  PAGE_FRAG_CACHE_MAX_ORDER);
>>  nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 446e945f736b..d643332c3ee5 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2900,7 +2900,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
>> page_frag *pfrag, gfp_t gfp)
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
> 
> Looks fine to me.
> 
> One thing you may want to consider would be to place this all in an
> inline function that could just consolidate all the code.

Do you think it is possible to further unify the implementations of
'struct page_frag_cache' and 'struct page_frag', so that adding an inline
function for the above becomes unnecessary?
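
If not, I guess the inline function would look something like the
below sketch (hypothetical, name made up):

        static inline struct page *page_frag_alloc_pages(gfp_t gfp,
                                                         unsigned int *size)
        {
                struct page *page;

        #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
                /* Avoid direct reclaim and reserves, but allow kswapd
                 * to wake; never warn or retry for the order-3 try.
                 */
                page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                   __GFP_COMP | __GFP_NOWARN |
                                   __GFP_NORETRY | __GFP_NOMEMALLOC,
                                   PAGE_FRAG_CACHE_MAX_ORDER);
                if (likely(page)) {
                        *size = PAGE_FRAG_CACHE_MAX_SIZE;
                        return page;
                }
        #endif
                page = alloc_page(gfp);
                *size = PAGE_SIZE;
                return page;
        }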

> 
> Reviewed-by: Alexander Duyck 
> 
> .
> 



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2024-01-04 Thread Yunsheng Lin
On 2024/1/5 0:17, Eugenio Perez Martin wrote:
> On Wed, Jan 3, 2024 at 11:00 AM Yunsheng Lin  wrote:

...

>> +
>> +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int batch, int bufs)
>> +{
>> +   const bool random_batch = batch == RANDOM_BATCH;
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   virtqueue_disable_cb(vq->vq);
>> +   do {
>> +   if (random_batch)
>> +   batch = (random() % vq->vring.num) + 1;
>> +
>> +   while (vq->started < bufs &&
>> +  (vq->started - vq->completed) < batch) {
>> +   sg_init_one(&sl, dev->test_buf, HDR_LEN + 
>> TEST_BUF_LEN);
>> +   r = virtqueue_add_outbuf(vq->vq, &sl, 1,
>> +dev->test_buf + 
>> vq->started,
>> +GFP_ATOMIC);
>> +   if (unlikely(r != 0)) {
>> +   if (r == -ENOSPC &&
>> +   vq->started > started_before)
>> +   r = 0;
>> +   else
>> +   r = -1;
>> +   break;
>> +   }
>> +
>> +   ++vq->started;
>> +
>> +   if (unlikely(!virtqueue_kick(vq->vq))) {
>> +   r = -1;
>> +   break;
>> +   }
>> +   }
>> +
>> +   if (vq->started >= bufs)
>> +   r = -1;
>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, &len)) {
>> +   int n, i;
>> +
>> +   n = recvfrom(dev->sock, dev->res_buf, 
>> TEST_BUF_LEN, 0, NULL, NULL);
>> +   assert(n == TEST_BUF_LEN);
>> +
>> +   for (i = ETHER_HDR_LEN; i < n; i++)
>> +   assert(dev->res_buf[i] == (char)i);
>> +
>> +   ++vq->completed;
>> +   r = 0;
>> +   }
>> +   } while (r == 0);
>> +
>> +   if (vq->completed == completed_before && vq->started == 
>> started_before)
>> +   ++spurious;
>> +
>> +   assert(vq->completed <= bufs);
>> +   assert(vq->started <= bufs);
>> +   if (vq->completed == bufs)
>> +   break;
>> +
>> +   if (delayed) {
>> +   if (virtqueue_enable_cb_delayed(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   } else {
>> +   if (virtqueue_enable_cb(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   }
>> +   }
>> +   printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n",
>> +  spurious, vq->started, vq->completed);
>> +}
>> +
>> +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int batch, int bufs)
>> +{
>> +   const bool random_batch = batch == RANDOM_BATCH;
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   do {
>> +   if (random_batch)
>> +   batch = (random() % vq->vring.num) + 1;
>> +
&

[PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2024-01-03 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.
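
At a high level, the test flow is roughly as follows (a condensed,
illustrative sketch; exact arguments elided):

        backend.fd = tun_alloc(&dev);   /* TAP device with vnet hdr */
        assert(backend.fd >= 0);
        vdev_create_socket(&dev);       /* packet socket bound to the TAP */
        vdev_info_init(&dev, features); /* vhost-net device setup */
        vq_info_add(&dev, ...);         /* rx/tx virtqueues */
        run_tx_test(...);       /* virtqueue -> vhost_net -> packet socket */
        run_rx_test(...);       /* packet socket -> vhost_net -> virtqueue */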

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 574 ++
 2 files changed, 579 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..cfffcef53d94
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,574 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int sockfd = dev->sock;
+   int ret;
+
+   memset(&saddrll, 0, sizeof(saddrll));
+   saddrll.sll_family = PF_PACKET;
+   saddrll.sll_ifindex = dev->ifindex;
+   saddrll.sll_halen = ETH_ALEN;
+   saddrll.sll_protocol = htons(TEST_PTYPE);
+
+   ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
+  

[PATCH net-next 5/6] net: introduce page_frag_cache_drain()

2024-01-03 Thread Yunsheng Lin
When draining a page_frag_cache, most users perform
similar steps, so introduce an API to avoid code
duplication.
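
The helper ends up looking roughly like this (inferred from the call
sites converted below; the actual definition is added in
mm/page_alloc.c):

        void page_frag_cache_drain(struct page_frag_cache *nc)
        {
                if (!nc->va)
                        return;

                __page_frag_cache_drain(virt_to_head_page(nc->va),
                                        nc->pagecnt_bias);
                nc->va = NULL;
        }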

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = &priv->rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(&priv->rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index d58b07e7e123..7063c78bd35f 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -301,19 +300,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -323,12 +315,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 08805f027810..c80037a78066 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(&queue->pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4cc27856aa8f..11237557cfc5 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
i

[PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2024-01-03 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_

[PATCH net-next 2/6] page_frag: unify gfp bits for order 3 page allocation

2024-01-03 Thread Yunsheng Lin
Currently there are three page frag implementations, all of
which try to allocate an order-3 page and, if that fails,
fall back to allocating an order-0 page; each of them allows
the order-3 page allocation to fail under certain conditions
by using specific gfp bits.

The gfp bits for order-3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, while
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
skb_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order-3 page allocation, to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 446e945f736b..d643332c3ee5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2900,7 +2900,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-20 Thread Yunsheng Lin
On 2023/12/21 10:33, Jason Wang wrote:
> On Wed, Dec 20, 2023 at 8:45 PM Yunsheng Lin  wrote:
>>
>> On 2023/12/12 12:35, Jason Wang wrote:
>>>>>> +done:
>>>>>> +   backend.fd = tun_alloc();
>>>>>> +   assert(backend.fd >= 0);
>>>>>> +   vdev_info_init(&dev, features);
>>>>>> +   vq_info_add(&dev, 256);
>>>>>> +   run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs);
>>>>>
>>>>> I'd expect we are testing some basic traffic here. E.g can we use a
>>>>> packet socket then we can test both tx and rx?
>>>>
>>>> Yes, only rx for tun is tested.
>>>> Do you have an idea how to test the tx too? As I am not familiar enough
>>>> with vhost_net and tun yet.
>>>
>>> Maybe you can have a packet socket to bind to the tun/tap. Then you can 
>>> test:
>>>
>>> 1) TAP RX: by write a packet via virtqueue through vhost_net and read
>>> it from packet socket
>>> 2) TAP TX:  by write via packet socket and read it from the virtqueue
>>> through vhost_net
>>
>> When implementing the TAP TX by adding VHOST_NET_F_VIRTIO_NET_HDR,
>> I found one possible use of uninitialized data in vhost_net_build_xdp().
>>
>> And vhost_hlen is set to sizeof(struct virtio_net_hdr_mrg_rxbuf) and
>> sock_hlen is set to zero in vhost_net_set_features() for both tx and rx
>> queue.
>>
>> For vhost_net_build_xdp() called by handle_tx_copy():
>>
>> The (gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) checking below may cause a
>> read of uninitialized data if sock_hlen is zero.
> 
> Which data is uninitialized here?

The 'gso' header: as sock_hlen is zero, there is no copying done by:

 copied = copy_page_from_iter(alloc_frag->page,
  alloc_frag->offset +
  offsetof(struct tun_xdp_hdr, gso),
  sock_hlen, from);
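
If so, one way to make the later gso flag checks well-defined would be
to zero the header when sock_hlen is zero, e.g. (untested sketch):

        if (!sock_hlen)
                memset(buf + offsetof(struct tun_xdp_hdr, gso), 0,
                       sizeof(struct virtio_net_hdr));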

> 
>>
>> And it seems vhost_hdr is skipped in get_tx_bufs():
>> https://elixir.bootlin.com/linux/latest/source/drivers/vhost/net.c#L616
>>
>> static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
>>struct iov_iter *from)
>> {
>> ...
>> buflen += SKB_DATA_ALIGN(len + pad);
>> alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
>> if (unlikely(!vhost_net_page_frag_refill(net, buflen,
>>  alloc_frag, GFP_KERNEL)))
>> return -ENOMEM;
>>
>> buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> copied = copy_page_from_iter(alloc_frag->page,
>>  alloc_frag->offset +
>>  offsetof(struct tun_xdp_hdr, gso),
>>  sock_hlen, from);
>> if (copied != sock_hlen)
>> return -EFAULT;
>>
>> hdr = buf;
>> gso = &hdr->gso;
>>
>> if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
>> vhost16_to_cpu(vq, gso->csum_start) +
>> vhost16_to_cpu(vq, gso->csum_offset) + 2 >
>> vhost16_to_cpu(vq, gso->hdr_len)) {
>> ...
>> }
>>
>> It seems handle_tx_copy() does not handle the VHOST_NET_F_VIRTIO_NET_HDR
>> case correctly, or do I miss something obvious here?
> 
> In get_tx_bufs() we did:
> 
> *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
> 
> Which covers this case?

It does not seem to cover it, as the vhost_hdr is just skipped without any
handling in get_tx_bufs():
https://elixir.bootlin.com/linux/v6.7-rc6/source/drivers/vhost/net.c#L616

> 
> Thanks



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-20 Thread Yunsheng Lin
On 2023/12/12 12:35, Jason Wang wrote:
>>>> +done:
>>>> +   backend.fd = tun_alloc();
>>>> +   assert(backend.fd >= 0);
>>>> +   vdev_info_init(&dev, features);
>>>> +   vq_info_add(&dev, 256);
>>>> +   run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs);
>>>
>>> I'd expect we are testing some basic traffic here. E.g can we use a
>>> packet socket then we can test both tx and rx?
>>
>> Yes, only rx for tun is tested.
>> Do you have an idea how to test the tx too? As I am not familiar enough
>> with vhost_net and tun yet.
> 
> Maybe you can have a packet socket to bind to the tun/tap. Then you can test:
> 
> 1) TAP RX: by write a packet via virtqueue through vhost_net and read
> it from packet socket
> 2) TAP TX:  by write via packet socket and read it from the virtqueue
> through vhost_net

When implementing the TAP TX by adding VHOST_NET_F_VIRTIO_NET_HDR,
I found one possible use of uninitialized data in vhost_net_build_xdp().

And vhost_hlen is set to sizeof(struct virtio_net_hdr_mrg_rxbuf) and
sock_hlen is set to zero in vhost_net_set_features() for both the tx and
rx queues.

For vhost_net_build_xdp() called by handle_tx_copy():

The (gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) check below may cause a
read of uninitialized data if sock_hlen is zero.

And it seems vhost_hdr is skipped in get_tx_bufs():
https://elixir.bootlin.com/linux/latest/source/drivers/vhost/net.c#L616

static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
   struct iov_iter *from)
{
...
buflen += SKB_DATA_ALIGN(len + pad);
alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
if (unlikely(!vhost_net_page_frag_refill(net, buflen,
 alloc_frag, GFP_KERNEL)))
return -ENOMEM;

buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
copied = copy_page_from_iter(alloc_frag->page,
 alloc_frag->offset +
 offsetof(struct tun_xdp_hdr, gso),
 sock_hlen, from);
if (copied != sock_hlen)
return -EFAULT;

hdr = buf;
gso = &hdr->gso;

if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
vhost16_to_cpu(vq, gso->csum_start) +
vhost16_to_cpu(vq, gso->csum_offset) + 2 >
vhost16_to_cpu(vq, gso->hdr_len)) {
...
}

It seems handle_tx_copy() does not handle the VHOST_NET_F_VIRTIO_NET_HDR
case correctly, or do I miss something obvious here?

> 
> Thanks
> 
>>
>>>
>>> Thanks
>>
> 
> .
> 



Re: [PATCH net-next v2 3/3] net: add netmem_t to skb_frag_t

2023-12-18 Thread Yunsheng Lin
On 2023/12/17 16:09, Mina Almasry wrote:
> Use netmem_t instead of page directly in skb_frag_t. Currently netmem_t
> is always a struct page underneath, but the abstraction allows efforts
> to add support for skb frags not backed by pages.
> 
> There is unfortunately 1 instance where the skb_frag_t is assumed to be
> a bio_vec in kcm. For this case, add a debug assert that the skb frag is
> indeed backed by a page, and do a cast.
> 
> Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
> that the API can be used to create netmem skbs.
> 
> Signed-off-by: Mina Almasry 
> 

...

>  
> -typedef struct bio_vec skb_frag_t;
> +typedef struct skb_frag {
> + struct netmem *bv_page;

bv_page -> bv_netmem?

> + unsigned int bv_len;
> + unsigned int bv_offset;
> +} skb_frag_t;
>  
>  /**
>   * skb_frag_size() - Returns the size of a skb fragment
> @@ -2431,22 +2436,37 @@ static inline unsigned int skb_pagelen(const struct 
> sk_buff *skb)
>   return skb_headlen(skb) + __skb_pagelen(skb);
>  }
>  

...

>  /**
> @@ -2462,10 +2482,10 @@ static inline void skb_len_add(struct sk_buff *skb, 
> int delta)
>  }
>  
>  /**
> - * __skb_fill_page_desc - initialise a paged fragment in an skb
> + * __skb_fill_netmem_desc - initialise a paged fragment in an skb
>   * @skb: buffer containing fragment to be initialised
>   * @i: paged fragment index to initialise
> - * @page: the page to use for this fragment
> + * @netmem: the netmem to use for this fragment
>   * @off: the offset to the data with @page
>   * @size: the length of the data
>   *
> @@ -2474,10 +2494,13 @@ static inline void skb_len_add(struct sk_buff *skb, 
> int delta)
>   *
>   * Does not take any additional reference on the fragment.
>   */
> -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> - struct page *page, int off, int size)
> +static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
> +   struct netmem *netmem, int off,
> +   int size)
>  {
> - __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
> + struct page *page = netmem_to_page(netmem);
> +
> + __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
>  
>   /* Propagate page pfmemalloc to the skb if we can. The problem is
>* that not all callers have unique ownership of the page but rely
> @@ -2485,7 +2508,21 @@ static inline void __skb_fill_page_desc(struct sk_buff 
> *skb, int i,
>*/
>   page = compound_head(page);
>   if (page_is_pfmemalloc(page))
> - skb->pfmemalloc = true;
> + skb->pfmemalloc = true;

Is it possible to introduce netmem_is_pfmemalloc() and netmem_compound_head()
for netmem, and have some build-time testing to ensure the implementation
is the same between page_is_pfmemalloc()/compound_head() and
netmem_is_pfmemalloc()/netmem_compound_head()? So that we can avoid
netmem_to_page() as much as possible, especially in drivers.
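
Something like the below sketch is what I have in mind (hypothetical,
assuming netmem is still always page-backed at this point):

        static inline bool netmem_is_pfmemalloc(struct netmem *netmem)
        {
                return page_is_pfmemalloc(netmem_to_page(netmem));
        }

        static inline struct netmem *netmem_compound_head(struct netmem *netmem)
        {
                return page_to_netmem(compound_head(netmem_to_page(netmem)));
        }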


> +}
> +
> +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> + struct page *page, int off, int size)
> +{
> + __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
> +}
> +

...

>   */
>  static inline struct page *skb_frag_page(const skb_frag_t *frag)
>  {
> - return frag->bv_page;
> + return netmem_to_page(frag->bv_page);

It seems we are not able to have safe type protection for the above
function: a driver may pass a devmem frag as a param here, then pass
the returned page into the mm subsystem, and the compiler is not able
to catch that at compile time.

>  }
>  
>  /**
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 83af8aaeb893..053d220aa2f2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -845,16 +845,24 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct 
> *napi, unsigned int len,
>  }
>  EXPORT_SYMBOL(__napi_alloc_skb);
>  

...

> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index 65d1f6755f98..5c46db045f4c 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -636,9 +636,15 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
>   for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
>   msize += skb_shinfo(skb)->frags[i].bv_len;
>  
> + /* The cast to struct bio_vec* here assumes the frags are
> +  * struct page based.
> +  */
> + DEBUG_NET_WARN_ON_ONCE(
> + !skb_frag_page(&skb_shinfo(skb)->frags[0]));

It seems skb_frag_page() always returns non-NULL in this patch, so the above
check seems unnecessary?



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-07 Thread Yunsheng Lin
On 2023/12/7 14:00, Jason Wang wrote:
> On Tue, Dec 5, 2023 at 7:35 PM Yunsheng Lin  wrote:
...

>> +
>> +static int tun_alloc(void)
>> +{
>> +   struct ifreq ifr;
>> +   int fd, e;
>> +
>> +   fd = open("/dev/net/tun", O_RDWR);
>> +   if (fd < 0) {
>> +   perror("Cannot open /dev/net/tun");
>> +   return fd;
>> +   }
>> +
>> +   memset(&ifr, 0, sizeof(ifr));
>> +
>> +   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
> 
> Why did you use IFF_TUN but not IFF_TAP here?

To be honest, no particular reason; I just picked IFF_TUN and it happened
to work for me to test the changes in vhost_net_build_xdp().

Is there a particular reason you prefer IFF_TAP over IFF_TUN?

> 
>> +   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
> 
> tun0 is pretty common if there's a VPN. Do we need some randomized name here?

How about something like below?

snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());

> 
> 
>> +
>> +   e = ioctl(fd, TUNSETIFF, (void *) &ifr);
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETIFF]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   return fd;
>> +}
>> +
>> +/* Unused */
>> +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
> 
> Why do we need trick like these here?

That is because of the below error:
tools/virtio/./linux/kernel.h:58: undefined reference to `__kmalloc_fake'

When virtio_ring.c is compiled in userspace, the kmalloc-related functions
are implemented in tools/virtio/./linux/kernel.h, which requires those
variables to be defined.

> 
>> +
>> +struct vq_info {
>> +   int kick;
>> +   int call;
>> +   int num;
>> +   int idx;
>> +   void *ring;
>> +   /* copy used for control */
>> +   struct vring vring;
>> +   struct virtqueue *vq;
>> +};
>> +
>> +struct vdev_info {
>> +   struct virtio_device vdev;
>> +   int control;
>> +   struct pollfd fds[1];
>> +   struct vq_info vqs[1];
>> +   int nvqs;
>> +   void *buf;
>> +   size_t buf_size;
>> +   struct vhost_memory *mem;
>> +};
>> +
>> +static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
>> +backend = { .index = 1, .fd = 1 };
> 
> A magic number like fd = 1 is pretty confusing.
> 
> And I don't see why we need global variables here.

I was using virtio_test.c as a reference; I will try to remove it
if possible.

> 
>> +static const struct vhost_vring_state null_state = {};
>> +

..

>> +
>> +done:
>> +   backend.fd = tun_alloc();
>> +   assert(backend.fd >= 0);
>> +   vdev_info_init(&dev, features);
>> +   vq_info_add(&dev, 256);
>> +   run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs);
> 
> I'd expect we are testing some basic traffic here. E.g can we use a
> packet socket then we can test both tx and rx?

Yes, only rx for tun is tested.
Do you have an idea how to test the tx too? As I am not familiar enough
with vhost_net and tun yet.

> 
> Thanks



Re: [PATCH net-next 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-07 Thread Yunsheng Lin
On 2023/12/7 11:15, Jakub Kicinski wrote:
> On Tue, 5 Dec 2023 19:34:40 +0800 Yunsheng Lin wrote:
>> __GFP_DIRECT_RECLAIM is xor'd to avoid
>> direct reclaim in skb_page_frag_refill(), but it is not
>> xor'd in __page_frag_cache_refill().
> 
> xor is not the same thing as masking a bit off.

You are right.
Will use 'mask off', thanks.

> The patch itself LGTM.
> .
> 



[PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-05 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 441 ++
 2 files changed, 446 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..7e7b7aba3668
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH -1
+
+static int tun_alloc(void)
+{
+   struct ifreq ifr;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
+   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, (void *) &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   return fd;
+}
+
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int num;
+   int idx;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct pollfd fds[1];
+   struct vq_info vqs[1];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   struct vhost_memory *mem;
+};
+
+static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
+backend = { .index = 1, .fd = 1 };
+static const struct vhost_vring_state null_state = {};
+
+bool vq_notify(struct virtqueue *vq)
+{
+   struct vq_info *info = vq->priv;
+   unsigned long long v = 1;
+   int r;
+   r = write(info->kick, &v, sizeof v);
+   assert(r == sizeof v);
+   return true;
+}
+
+void vq_callback(struct virtqueue *vq)
+{
+}
+
+
+void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+   struct vhost_vring_state state = { .index = info->idx };
+   struct vhost_vring_file file = { .index = info->idx };
+   unsigned long long features = dev->vdev.features;
+   struct vhost_vring_addr addr = {
+   .index = info->idx,
+   .desc_user_addr = (uint64_t)(unsigned long)info->vring.desc,
+   .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail,
+   .used_user_addr = (uint64_t)(unsigned long)info->vring.used,
+   };
+   int r;
+   r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
+   assert(r >= 0);
+   state.num = info->vring.num;
+   r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+   assert(r >= 0);
+   state.num = 0;
+   r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+   assert(r >= 0);
+   r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+   assert(r >= 0);
+   file.fd = info->kick;
+   r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+   assert(r >= 0);
+   file.fd = info->call;
+   r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+   assert(r >= 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
+{
+   if (info->vq)
+   vring_del_virtqueue(info->vq);
+
+   memset(info

[PATCH net-next 5/6] net: introduce page_frag_cache_drain()

2023-12-05 Thread Yunsheng Lin
When draining a page_frag_cache, most users perform
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = &priv->rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(&priv->rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7ffbd4fca881..df0a3ceaf59b 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d79811cfa0ce..1c85e1398e4e 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(&queue->pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4cc27856aa8f..11237557cfc5 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_

[PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2023-12-05 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
  

[PATCH net-next 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-05 Thread Yunsheng Lin
Currently there are three page frag implementations, all of which
try to allocate an order-3 page and fall back to an order-0 page
if that fails, and each of them allows the order-3 allocation to
fail under certain conditions by using specific gfp bits.

The gfp bits used for the order-3 allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access to
emergency reserve memory in __page_frag_cache_refill(), but not in
the other implementations, while __GFP_DIRECT_RECLAIM is masked off
to avoid direct reclaim in skb_page_frag_refill(), but not in
__page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for the order-3 page allocation, avoiding
possible pressure on mm.
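
For reference, the following minimal sketch makes the common bit
pattern explicit. No such helper exists in this patch (the mask is
computed inline in each hunk below); it is shown only for illustration:

static inline gfp_t page_frag_order3_gfp(gfp_t gfp)
{
	/* mask off direct reclaim and forbid emergency reserves so
	 * the high-order attempt fails fast, quietly and cheaply
	 */
	return (gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN |
	       __GFP_NORETRY | __GFP_NOMEMALLOC;
}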

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..4efa9cae4b0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




[PATCH RFC 5/6] net: introduce page_frag_cache_drain()

2023-12-01 Thread Yunsheng Lin
When draining a page_frag_cache, most users perform similar
steps, so introduce an API to avoid code duplication.
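
The helper itself is not visible in the quoted hunks (the
include/linux/gfp.h and mm/page_alloc.c hunks are cut off below), but
judging from the caller code it replaces, a minimal sketch would look
like this (virt_to_head_page() is an assumption here, matching the nvme
callers; the gve caller used virt_to_page()):

void page_frag_cache_drain(struct page_frag_cache *nc)
{
	if (!nc->va)
		return;

	__page_frag_cache_drain(virt_to_head_page(nc->va),
				nc->pagecnt_bias);
	nc->va = NULL;
}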

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = &priv->rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(&priv->rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7ffbd4fca881..df0a3ceaf59b 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(&q->cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(&q->cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 89661a9cf850..8d4f4a06f9d9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1338,7 +1338,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
unsigned int noreclaim_flag;
@@ -1349,11 +1348,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(&queue->pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 92b74d0b8686..f9a553d70a61 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_

[PATCH RFC 6/6] tools: virtio: introduce vhost_net_test

2023-12-01 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test the
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 441 ++
 2 files changed, 446 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..7e7b7aba3668
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH -1
+
+static int tun_alloc(void)
+{
+   struct ifreq ifr;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
+   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, (void *) &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   return fd;
+}
+
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int num;
+   int idx;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct pollfd fds[1];
+   struct vq_info vqs[1];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   struct vhost_memory *mem;
+};
+
+static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
+backend = { .index = 1, .fd = 1 };
+static const struct vhost_vring_state null_state = {};
+
+bool vq_notify(struct virtqueue *vq)
+{
+   struct vq_info *info = vq->priv;
+   unsigned long long v = 1;
+   int r;
+   r = write(info->kick, &v, sizeof v);
+   assert(r == sizeof v);
+   return true;
+}
+
+void vq_callback(struct virtqueue *vq)
+{
+}
+
+
+void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+   struct vhost_vring_state state = { .index = info->idx };
+   struct vhost_vring_file file = { .index = info->idx };
+   unsigned long long features = dev->vdev.features;
+   struct vhost_vring_addr addr = {
+   .index = info->idx,
+   .desc_user_addr = (uint64_t)(unsigned long)info->vring.desc,
+   .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail,
+   .used_user_addr = (uint64_t)(unsigned long)info->vring.used,
+   };
+   int r;
+   r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
+   assert(r >= 0);
+   state.num = info->vring.num;
+   r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+   assert(r >= 0);
+   state.num = 0;
+   r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+   assert(r >= 0);
+   r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+   assert(r >= 0);
+   file.fd = info->kick;
+   r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+   assert(r >= 0);
+   file.fd = info->call;
+   r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+   assert(r >= 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
+{
+   if (info->vq)
+   vring_del_virtqueue(info->vq);
+
+   memset(info

[PATCH RFC 4/6] vhost/net: remove vhost_net_page_frag_refill()

2023-12-01 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also yields about a 0.5% performance
boost when tested with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
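
The quoted hunk is cut off here. Presumably the new err label at the
end of vhost_net_build_xdp() releases the fragment taken above, along
the lines of the following sketch (page_frag_free() is the usual
counterpart of page_frag_alloc_align(); the exact label placement is
an assumption):

err:
	page_frag_free(buf);	/* assumed: drop the frag on failure */
	return ret;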
  

[PATCH RFC 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-01 Thread Yunsheng Lin
Currently there are three page frag implementations, all of which
try to allocate an order-3 page and fall back to an order-0 page
if that fails, and each of them allows the order-3 allocation to
fail under certain conditions by using specific gfp bits.

The gfp bits used for the order-3 allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access to
emergency reserve memory in __page_frag_cache_refill(), but not in
the other implementations, while __GFP_DIRECT_RECLAIM is masked off
to avoid direct reclaim in skb_page_frag_refill(), but not in
__page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for the order-3 page allocation, avoiding
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..4efa9cae4b0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH] memory tier: rename destroy_memory_type() to put_memory_type()

2023-07-06 Thread Miaohe Lin
On 2023/7/6 19:58, Xiao Yang wrote:
> On 2023/7/6 14:39, Miaohe Lin wrote:
>> It appears that destroy_memory_type() isn't a very good name because
>> we usually will not free the memory_type here. So rename it to a more
>> appropriate name i.e. put_memory_type().
>>
>> Suggested-by: Huang, Ying 
>> Signed-off-by: Miaohe Lin 
>> ---
>>   drivers/dax/kmem.c   | 4 ++--
>>   include/linux/memory-tiers.h | 4 ++--
>>   mm/memory-tiers.c    | 6 +++---
>>   3 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>> index 898ca9505754..c57acb73e3db 100644
>> --- a/drivers/dax/kmem.c
>> +++ b/drivers/dax/kmem.c
>> @@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
>>   return rc;
>>     error_dax_driver:
>> -    destroy_memory_type(dax_slowmem_type);
>> +    put_memory_type(dax_slowmem_type);
>>   err_dax_slowmem_type:
>>   kfree_const(kmem_name);
>>   return rc;
>> @@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
>>   dax_driver_unregister(&device_dax_kmem_driver);
>>   if (!any_hotremove_failed)
>>   kfree_const(kmem_name);
>> -    destroy_memory_type(dax_slowmem_type);
>> +    put_memory_type(dax_slowmem_type);
>>   }
>>     MODULE_AUTHOR("Intel Corporation");
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index fc9647b1b4f9..437441cdf78f 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -33,7 +33,7 @@ struct memory_dev_type {
>>   #ifdef CONFIG_NUMA
>>   extern bool numa_demotion_enabled;
>>   struct memory_dev_type *alloc_memory_type(int adistance);
>> -void destroy_memory_type(struct memory_dev_type *memtype);
>> +void put_memory_type(struct memory_dev_type *memtype);
>>   void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>   #ifdef CONFIG_MIGRATION
>> @@ -68,7 +68,7 @@ static inline struct memory_dev_type 
>> *alloc_memory_type(int adistance)
>>   return NULL;
>>   }
>>   -static inline void destroy_memory_type(struct memory_dev_type *memtype)
>> +static inline void put_memory_type(struct memory_dev_type *memtype)
>>   {
>>     }
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 1719fa3bcf02..c49ab03f49b1 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int 
>> adistance)
>>   }
>>   EXPORT_SYMBOL_GPL(alloc_memory_type);
>>   -void destroy_memory_type(struct memory_dev_type *memtype)
>> +void put_memory_type(struct memory_dev_type *memtype)
>>   {
>>   kref_put(&memtype->kref, release_memtype);
>>   }
>> -EXPORT_SYMBOL_GPL(destroy_memory_type);
>> +EXPORT_SYMBOL_GPL(put_memory_type);
>>     void init_node_memory_type(int node, struct memory_dev_type *memtype)
>>   {
>> @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct 
>> memory_dev_type *memtype)
>>    */
>>   if (!node_memory_types[node].map_count) {
>>   node_memory_types[node].memtype = NULL;
>> -    destroy_memory_type(memtype);
>> +    put_memory_type(memtype);
> Hi Miaohe,
> 
> I didn't find that destroy_memory_type(memtype) is called here on mainline 
> kernel. Did I miss something?

It's on linux-next tree:
 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=e5bf0402b80d80b66e9765b2c160b5199a5c7d3b

> 
> Other than that, it looks good to me.
> Reviewed-by: Xiao Yang 

Thanks for your review and comment.




[PATCH] memory tier: rename destroy_memory_type() to put_memory_type()

2023-07-05 Thread Miaohe Lin
It appears that destroy_memory_type() isn't a very good name because
we usually do not free the memory_type here. So rename it to a more
appropriate name, i.e. put_memory_type().
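
The "put" naming matches the underlying kref semantics shown in the
mm/memory-tiers.c hunk below: the release callback runs only when the
last reference is dropped, so most calls do not actually free anything.
Conceptually:

	kref_put(&memtype->kref, release_memtype);
	/* release_memtype() runs only when the refcount drops to zero */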

Suggested-by: Huang, Ying 
Signed-off-by: Miaohe Lin 
---
 drivers/dax/kmem.c   | 4 ++--
 include/linux/memory-tiers.h | 4 ++--
 mm/memory-tiers.c| 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 898ca9505754..c57acb73e3db 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
return rc;
 
 error_dax_driver:
-   destroy_memory_type(dax_slowmem_type);
+   put_memory_type(dax_slowmem_type);
 err_dax_slowmem_type:
kfree_const(kmem_name);
return rc;
@@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
dax_driver_unregister(&device_dax_kmem_driver);
if (!any_hotremove_failed)
kfree_const(kmem_name);
-   destroy_memory_type(dax_slowmem_type);
+   put_memory_type(dax_slowmem_type);
 }
 
 MODULE_AUTHOR("Intel Corporation");
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc9647b1b4f9..437441cdf78f 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
 struct memory_dev_type *alloc_memory_type(int adistance);
-void destroy_memory_type(struct memory_dev_type *memtype);
+void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 #ifdef CONFIG_MIGRATION
@@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int 
adistance)
return NULL;
 }
 
-static inline void destroy_memory_type(struct memory_dev_type *memtype)
+static inline void put_memory_type(struct memory_dev_type *memtype)
 {
 
 }
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 1719fa3bcf02..c49ab03f49b1 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance)
 }
 EXPORT_SYMBOL_GPL(alloc_memory_type);
 
-void destroy_memory_type(struct memory_dev_type *memtype)
+void put_memory_type(struct memory_dev_type *memtype)
 {
kref_put(&memtype->kref, release_memtype);
 }
-EXPORT_SYMBOL_GPL(destroy_memory_type);
+EXPORT_SYMBOL_GPL(put_memory_type);
 
 void init_node_memory_type(int node, struct memory_dev_type *memtype)
 {
@@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 */
if (!node_memory_types[node].map_count) {
node_memory_types[node].memtype = NULL;
-   destroy_memory_type(memtype);
+   put_memory_type(memtype);
}
mutex_unlock(&memory_tier_lock);
 }
-- 
2.33.0




[PATCH v3 0/4] close various race windows for swap

2021-04-20 Thread Miaohe Lin
Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all of these races. But using the
current get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible, because swap_readpage() may take a
really long time. To reduce the performance overhead on the hot path
as much as possible, it appears we can use a percpu_ref to close this
race window (as suggested by Huang, Ying). Patch 1 adds percpu_ref
support for swap, and most of the remaining patches use it to close
various race windows. More details can be found in the respective
changelogs. Thanks!
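
The usage pattern that results on the swap-in paths (shown in full in
the individual patches) boils down to the following sketch:

	struct swap_info_struct *si;

	/* Prevent swapoff from happening to us. */
	si = get_swap_device(entry);
	if (unlikely(!si))
		return NULL;	/* raced with swapoff */

	/* si->swap_file, si->flags, etc. are stable here */

	put_swap_device(si);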

v2->v3:
  commit log and comment enhancements per Huang, Ying
  remove ref_initialized field
  squash PATCH 1-2

v1->v2:
  reorganize the patch-2/5
  various enhancements and fixups per Huang, Ying
  Many thanks for the comments of Huang, Ying, Dennis Zhou and Tim Chen.

Miaohe Lin (4):
  mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  swap: fix do_swap_page() race with swapoff
  mm/swap: remove confusing checking for non_swap_entry() in
swap_ra_info()
  mm/shmem: fix shmem_swapin() race with swapoff

 include/linux/swap.h | 14 ++--
 mm/memory.c  |  9 +
 mm/shmem.c   |  6 
 mm/swap_state.c  |  6 
 mm/swapfile.c| 79 +++-
 5 files changed, 76 insertions(+), 38 deletions(-)

-- 
2.19.1



[PATCH v3 2/4] swap: fix do_swap_page() race with swapoff

2021-04-20 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   CPU 2
-                                       -
do_swap_page
  if (data_race(si->flags & SWP_SYNCHRONOUS_IO)
  swap_readpage
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                        swapoff
                                          p->flags &= ~SWP_VALID;
                                          ..
                                          synchronize_rcu();
                                          ..
                                          p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping; [oops!]

Note that for the pages that are swapped in through swap cache, this isn't
an issue. Because the page is locked, and the swap entry will be marked
with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
unlocked.

Using the current get/put_swap_device() to guard against concurrent
swapoff for swap_readpage() looks terrible, because swap_readpage() may
take a really long time. And this race may not be really pernicious,
because swapoff is usually done only at system shutdown. To reduce the
performance overhead on the hot path as much as possible, it appears we
can use the percpu_ref to close this race window (as suggested by
Huang, Ying).
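
The hot-path cost stays low because taking a live percpu_ref only bumps
a per-CPU counter. Simplified (not verbatim) from
include/linux/percpu-refcount.h, the fast path of
percpu_ref_tryget_live() is roughly:

	rcu_read_lock();
	if (__ref_is_percpu(ref, &percpu_count)) {
		/* common case: no shared cacheline is touched */
		this_cpu_inc(*percpu_count);
		ret = true;
	} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
		/* ref switched to atomic mode, e.g. during swapoff */
		ret = atomic_long_inc_not_zero(&ref->data->count);
	}
	rcu_read_unlock();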

Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Reported-by: kernel test robot  (auto build test ERROR)
Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h | 9 +
 mm/memory.c  | 9 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c9e7fea10b83..46d51d058d05 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -527,6 +527,15 @@ static inline struct swap_info_struct 
*swp_swap_info(swp_entry_t entry)
return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+   return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)  (NULL)
 #define get_nr_swap_pages()0L
 #define total_swap_pages   0L
diff --git a/mm/memory.c b/mm/memory.c
index 27014c3bde9f..7a2fe12cf641 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
+   struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(entry);
+   if (unlikely(!si))
+   goto out;
 
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
@@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+   if (si)
+   put_swap_device(si);
return ret;
 out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(swapcache);
put_page(swapcache);
}
+   if (si)
+   put_swap_device(si);
return ret;
 }
 
-- 
2.19.1



[PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-20 Thread Miaohe Lin
Using the current get/put_swap_device() to guard against concurrent
swapoff for some swap ops, e.g. swap_readpage(), looks terrible,
because they might take a really long time. This patch adds percpu_ref
support to serialize against concurrent swapoff. We also remove the
SWP_VALID flag, because it was only used together with the RCU solution.
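
The swapoff side then becomes the following teardown sequence (pieced
together from the hunks below and the rest of the series; shown here
only as a summary sketch):

	percpu_ref_kill(&p->users);	/* new get_swap_device() calls fail */
	synchronize_rcu();		/* protect swap cache accesses */
	...
	wait_for_completion(&p->comp);	/* wait for existing users to drain */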

Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  5 +--
 mm/swapfile.c| 79 +++-
 2 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..c9e7fea10b83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
-   SWP_VALID   = (1 << 13),/* swap is valid to be operated on? */
/* add others here before... */
SWP_SCANNING= (1 << 14),/* refcount in scan_swap_map */
 };
@@ -240,6 +239,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+   struct percpu_ref users;/* indicate and keep swap device valid. 
*/
unsigned long   flags;  /* SWP_USED etc: see above */
signed shortprio;   /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
@@ -260,6 +260,7 @@ struct swap_info_struct {
struct block_device *bdev;  /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size;/* seldom referenced */
+   struct completion comp; /* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
atomic_t frontswap_pages;   /* frontswap pages in-use counter */
@@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-   rcu_read_unlock();
+   percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..a640fc84be5b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+   struct swap_info_struct *si;
+
+   si = container_of(ref, struct swap_info_struct, users);
+   complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
struct swap_cluster_info *ci = si->cluster_info;
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct 
swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1  CPU2
  *   do_swap_page()
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
-   rcu_read_lock();
-   if (data_race(!(si->flags & SWP_VALID)))
-   goto unlock_out;
+   if (!percpu_ref_tryget_live(&si->users))
+   goto out;
+   /*
+* Guarantee the si->users are checked before accessing other
+* fields of swap_info_struct.
+*
+* Paired with the spin_unlock() after setup_swap_info() in
+* enable_swap_info().
+*/
+   smp_rmb();
offset = swp_offset(entry);
if (offse

[PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-20 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   CPU 2
-                                       -
shmem_swapin
  swap_cluster_readahead
    if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
                                        swapoff
                                          percpu_ref_kill(&p->users)
                                          synchronize_rcu()
                                          wait_for_completion
                                          ..
                                          si->swap_file = NULL;
    struct inode *inode = si->swap_file->f_mapping->host; [oops!]

Close this race window by using get/put_swap_device() to guard against
concurrent swapoff.

Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or 
not")
Signed-off-by: Miaohe Lin 
---
 mm/shmem.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 26c76b13ad23..936ba5595297 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
vm_area_struct *vma)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
 {
+   struct swap_info_struct *si;
struct vm_area_struct pvma;
struct page *page;
struct vm_fault vmf = {
.vma = &pvma,
};
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(swap);
+   if (unlikely(!si))
+   return NULL;
shmem_pseudo_vma_init(&pvma, info, index);
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
+   put_swap_device(si);
 
return page;
 }
-- 
2.19.1



[PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()

2021-04-20 Thread Miaohe Lin
The non_swap_entry() check was added for VMA based swap readahead via
commit ec560175c0b6 ("mm, swap: VMA based swap readahead"). It was then
moved to swap_ra_info() in commit eaf649ebc3ac ("mm: swap: clean up swap
readahead"). But this makes the code confusing. The non_swap_entry()
check looks racy, because somebody else might have faulted in this pte
while we had released the pte lock. So we would have to check whether it
is a swap pte first to guard against such a race, or the swap_type will
be unexpected. But the swap entry isn't actually used in this function,
and we will have enough checks when we really operate on the PTE entries
later. So the non_swap_entry() check is not really needed here and
should be removed to avoid confusion.

Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..df5405384520 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
-   swp_entry_t entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
@@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 
faddr = vmf->address;
orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-   entry = pte_to_swp_entry(*pte);
-   if ((unlikely(non_swap_entry(entry {
-   pte_unmap(orig_pte);
-   return;
-   }
 
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
-- 
2.19.1



Re: [PATCH net v4 1/2] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/20 7:55, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 05:29:46PM +0200, Michal Kubecek wrote:
>>
>> As pointed out in the discussion on v3, this patch may result in
>> significantly higher CPU consumption with multiple threads competing on
>> a saturated outgoing device. I missed this submission so that I haven't
>> checked it yet but given the description of v3->v4 changes above, it's
>> quite likely that it suffers from the same problem.
> 
> And it indeed does. However, with the additional patch from the v3
> discussion, the numbers are approximately the same as with an unpatched
> mainline kernel.
> 
> As with v4, I tried this patch on top of 5.12-rc7 with real devices.
> I used two machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs
> (2 8-core CPUs with HT disabled) and 16 Rx/Tx queues, receiver has
> 48 CPUs (2 12-core CPUs with HT enabled) and 48 Rx/Tx queues.
> 
>   threads5.12-rc75.12-rc7 + v45.12-rc7 + v4 + stop
>  125.1%  38.1%22.9%
>  866.2% 277.0%74.1%
> 1690.1% 150.7%91.0%
> 32   107.2% 272.6%   108.3%
> 64   116.3% 487.5%   118.1%
>128   126.1% 946.7%   126.9%
> 
> (The values are normalized to one core, i.e. 100% corresponds to one
> fully used logical CPU.)
> 
> So it seems that repeated scheduling while the queue was stopped is
> indeed the main performance issue and that other cases of the logic
> being too pessimistic do not play significant role. There is an
> exception with 8 connections/threads and the result with just this
> series also looks abnormally high (e.g. much higher than with
> 16 threads). It might be worth investigating what happens there and
> what do the results with other thread counts around 8 look like.

Will try to investigate the 8 connections/threads case.

> 
> I'll run some more tests with other traffic patterns tomorrow and
> I'm also going to take a closer look at the additional patch.

Thanks for the detailed testing and for taking a look.

> 
> Michal
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/19 22:57, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 10:04:27AM +0800, Yunsheng Lin wrote:
>>>
>>> I tried this patch o top of 5.12-rc7 with real devices. I used two
>>> machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs
>>> with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core
>>> CPUs with HT enabled) and 48 Rx/Tx queues. With multiple TCP streams on
>>> a saturated ethernet, the CPU consumption grows quite a lot:
>>>
>>> threads unpatched 5.12-rc75.12-rc7 + v3   
>>>   1   25.6%   30.6%
>>>   8   73.1%  241.4%
>>> 128  132.2% 1012.0%
>>>
>>> (The values are normalized to one core, i.e. 100% corresponds to one
>>> fully used logical CPU.) I didn't perform a full statistical evaluation
>>> but the growth is way beyond any statistical fluctuation with one
>>> exception: 8-thread test of patched kernel showed values from 155.5% to
>>> 311.4%. Closer look shows that most of the CPU time was spent in softirq
>>> and running top in parallel with the test confirms that there are
>>> multiple ksofirqd threads running at 100% CPU. I had similar problem
>>> with earlier versions of my patch (work in progress, I still need to
>>> check some corner cases and write commit message explaining the logic)
>>
>> Great, if there is a better idea, maybe share the core idea first so
>> that we both can work on the that?
> 
> I'm not sure if it's really better but to minimize the false positives
> and unnecessary calls to __netif_schedule(), I replaced q->seqlock with
> an atomic combination of a "running" flag (which corresponds to current
> seqlock being locked) and a "drainers" count (number of other threads
> going to clean up the qdisc queue). This way we could keep track of them
> and get reliable information if another thread is going to run a cleanup
> after we leave the qdisc_run() critical section (so that there is no
> need to schedule).

It seems you are trying to match the skb enqueuing with the calling of
__qdisc_run() here, which is not reliable when considering the dequeue
batching, see try_bulk_dequeue_skb() or try_bulk_dequeue_skb_slow() in
dequeue_skb().

> 
>>> The biggest problem IMHO is that the loop in __qdisc_run() may finish
>>> without rescheduling not only when the qdisc queue is empty but also
>>> when the corresponding device Tx queue is stopped which devices tend to
>>> do whenever they cannot send any more packets out. Thus whenever
>>> __QDISC_STATE_NEED_RESCHEDULE is set with device queue stopped or
>>> frozen, we keep rescheduling the queue cleanup without any chance to
>>> progress or clear __QDISC_STATE_NEED_RESCHEDULE. For this to happen, all
>>> we need is another thready to fail the first spin_trylock() while device
>>> queue is stopped and qdisc queue not empty.
>>
>> Yes, We could just return false before doing the second spin_trylock() if
>> the netdev queue corresponding qdisc is stopped, and when the netdev queue
>> is restarted, __netif_schedule() is called again, see netif_tx_wake_queue().
>>
>> Maybe add a sch_qdisc_stopped() function and do the testing in
>> qdisc_run_begin:
>>
>> if (dont_retry || sch_qdisc_stopped())
>>	return false;
>>
>> bool sch_qdisc_stopped(struct Qdisc *q)
>> {
>>	const struct netdev_queue *txq = q->dev_queue;
>>
>>	if (netif_xmit_frozen_or_stopped(txq))
>>		return true;
>>
>>	return false;
>> }
>>
>> At least for qdisc with TCQ_F_ONETXQUEUE flags set is doable?
> 
> Either this or you can do the check in qdisc_run_end() - when the device
> queue is stopped or frozen, there is no need to schedule as we know it's
> going to be done when the flag is cleared again (and we cannot do
> anything until then anyway).
> 
>>> Another part of the problem may be that to avoid the race, the logic is
>>> too pessimistic: consider e.g. (dotted lines show "barriers" where
>>> ordering is important):
>>>
>>> CPU ACPU B
>>> spin_trylock() succeeds
>>>  pfifo_fast_enqueue()
>>> ..
>>> skb_array empty, exit loop
>>>  first spin_trylock() fails
>>>  set __QDISC_STA

Re: [Linuxarm] Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/19 10:04, Yunsheng Lin wrote:
> On 2021/4/19 6:59, Michal Kubecek wrote:
>> On Thu, Mar 25, 2021 at 11:13:11AM +0800, Yunsheng Lin wrote:
>>> Lockless qdisc has below concurrent problem:
>>> cpu0 cpu1
>>>  . .
>>> q->enqueue .
>>>  . .
>>> qdisc_run_begin()  .
>>>  . .
>>> dequeue_skb()  .
>>>  . .
>>> sch_direct_xmit()  .
>>>  . .
>>>  .q->enqueue
>>>  . qdisc_run_begin()
>>>  .return and do nothing
>>>  . .
>>> qdisc_run_end().
>>>
>>> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
>>> has not released the lock yet and spin_trylock() return false
>>> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
>>> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
>>> enqueue the skb after cpu0 calling dequeue_skb() and before
>>> cpu0 calling qdisc_run_end().
>>>
>>> Lockless qdisc has below another concurrent problem when
>>> tx_action is involved:
>>>
>>> cpu0(serving tx_action) cpu1 cpu2
>>>   .   ..
>>>   .  q->enqueue.
>>>   .qdisc_run_begin()   .
>>>   .  dequeue_skb() .
>>>   .   .q->enqueue
>>>   .   ..
>>>   . sch_direct_xmit()  .
>>>   .   . qdisc_run_begin()
>>>   .   .   return and do nothing
>>>   .   ..
>>>  clear __QDISC_STATE_SCHED..
>>>  qdisc_run_begin()..
>>>  return and do nothing..
>>>   .   ..
>>>   .qdisc_run_end() .
>>>
>>> This patch fixes the above data race by:
>>> 1. Get the flag before doing spin_trylock().
>>> 2. If the first spin_trylock() return false and the flag is not
>>>set before the first spin_trylock(), Set the flag and retry
>>>another spin_trylock() in case other CPU may not see the new
>>>flag after it releases the lock.
>>> 3. reschedule if the flags is set after the lock is released
>>>at the end of qdisc_run_end().
>>>
>>> For tx_action case, the flags is also set when cpu1 is at the
>>> end if qdisc_run_end(), so tx_action will be rescheduled
>>> again to dequeue the skb enqueued by cpu2.
>>>
>>> Only clear the flag before retrying a dequeuing when dequeuing
>>> returns NULL in order to reduce the overhead of the above double
>>> spin_trylock() and __netif_schedule() calling.
>>>
>>> The performance impact of this patch, tested using pktgen and
>>> dummy netdev with pfifo_fast qdisc attached:
>>>
>>>  threads  without+this_patch   with+this_patch  delta
>>> 12.61Mpps2.60Mpps   -0.3%
>>> 23.97Mpps3.82Mpps   -3.7%
>>> 45.62Mpps5.59Mpps   -0.5%
>>> 82.78Mpps2.77Mpps   -0.3%
>>>162.22Mpps2.22Mpps   -0.0%
>>>
>>> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
>>> Signed-off-by: Yunsheng Lin 
>>> ---
>>> V3: fix a compile error and a few comment typo, remove the
>>> __QDISC_STATE_DEACTIVATED checking, and update the
>>> performance data.
>>> V2: Avoid the overhead of fixing the data race as much as
>>> possible.
>>
>> I tried this patch o top of 5.12-rc7 with real devices. I used two
>> machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs
>> with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core
>> CPUs with HT enabled) and 48 Rx/Tx queues. With multiple TCP streams on
>> a saturated ethernet, the CPU consumption grows quite a lot:
>>
>> threads unpatched 5.12-rc75.12-rc7 + v3   
>>   1   25.6%   30.6%
>>   8

Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:52, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 15:09, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> On 2021/4/19 10:48, Huang, Ying wrote:
>>>>> Miaohe Lin  writes:
>>>>>
>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>> patch adds the percpu_ref support for swap.
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin 
>>>>>> ---
>>>>>>  include/linux/swap.h |  3 +++
>>>>>>  mm/swapfile.c| 33 +
>>>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>> index 144727041e78..8be36eb58b7a 100644
>>>>>> --- a/include/linux/swap.h
>>>>>> +++ b/include/linux/swap.h
>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>   */
>>>>>>  struct swap_info_struct {
>>>>>> +struct percpu_ref users;/* serialization against 
>>>>>> concurrent swapoff */
>>>>>
>>>>> The comments aren't general enough.  We use this to check whether the
>>>>> swap device has been fully initialized, etc. May be something as below?
>>>>>
>>>>> /* indicate and keep swap device valid */
>>>>
>>>> Looks good.
>>>>
>>>>>
>>>>>>  unsigned long   flags;  /* SWP_USED etc: see above */
>>>>>>  signed shortprio;   /* swap priority of this type */
>>>>>>  struct plist_node list; /* entry in swap_active_head */
>>>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>>>  struct block_device *bdev;  /* swap device or bdev of swap 
>>>>>> file */
>>>>>>  struct file *swap_file; /* seldom referenced */
>>>>>>  unsigned int old_block_size;/* seldom referenced */
>>>>>> +bool ref_initialized;   /* seldom referenced */
>>>>>> +struct completion comp; /* seldom referenced */
>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>  unsigned long *frontswap_map;   /* frontswap in-use, one bit 
>>>>>> per page */
>>>>>>  atomic_t frontswap_pages;   /* frontswap pages in-use 
>>>>>> counter */
>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>> index 149e77454e3c..66515a3a2824 100644
>>>>>> --- a/mm/swapfile.c
>>>>>> +++ b/mm/swapfile.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include 
>>>>>>  #include 
>>>>>>  #include 
>>>>>> +#include 
>>>>>>  
>>>>>>  #include 
>>>>>>  #include 
>>>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct 
>>>>>> *work)
>>>>>>  spin_unlock(&si->lock);
>>>>>>  }
>>>>>>  
>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>> +{
>>>>>> +struct swap_info_struct *si;
>>>>>> +
>>>>>> +si = container_of(ref, struct swap_info_struct, users);
>>>>>> +complete(&si->comp);
>>>>>> +}
>>>>>> +
>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long 
>>>>>> idx)
>>>>>>  {
>>>>>>  struct swap_cluster_info *ci = si->cluster_info;
>>>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct 
>>>>>> swap_info_struct *p, int prio,
>>>>>>   * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>   * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>   */
>>>>>> -synchronize_rcu();
>>>>>
>>>>> You cannot remove this without changing get/put_swap_device().  It's
>>>>> better to squash at least PATCH 1-2.
>>>>
>>>

Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:41, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 15:04, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> On 2021/4/19 10:15, Huang, Ying wrote:
>>>>> Miaohe Lin  writes:
>>>>>
>>>>>> When I was investigating the swap code, I found the below possible race
>>>>>> window:
>>>>>>
>>>>>> CPU 1   CPU 2
>>>>>> -   -
>>>>>> shmem_swapin
>>>>>>   swap_cluster_readahead
>>>>>> if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>>> swapoff
>>>>>>   si->flags &= 
>>>>>> ~SWP_VALID;
>>>>>>   ..
>>>>>>   synchronize_rcu();
>>>>>>   ..
>>>>>
>>>>> You have removed these code in the previous patches of the series.  And
>>>>> they are not relevant in this patch.
>>>>
>>>> Yes, I should change these. Thanks.
>>>>
>>>>>
>>>>>>   si->swap_file = NULL;
>>>>>> struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>>>
>>>>>> Close this race window by using get/put_swap_device() to guard against
>>>>>> concurrent swapoff.
>>>>>>
>>>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>>>
>>>>> No.  This isn't the commit that introduces the race condition.  Please
>>>>> recheck your git blame result.
>>>>>
>>>>
>>>> I think this is really hard to find exact commit. I used git blame and 
>>>> found
>>>> this race should be existed when this is introduced. Any suggestion ?
>>>> Thanks.
>>>
>>> I think the commit that introduces the race condition is commit
>>> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
>>> not")
>>>
>>
>> Thanks.
>> The commit log only describes one race condition. And for that one, this 
>> should be correct
>> Fixes tag. But there are still many other race conditions inside 
>> swap_cluster_readahead,
>> such as swap_readpage() called from swap_cluster_readahead. This tag could 
>> not cover the
>> all race windows.
> 
> No. swap_readpage() in swap_cluster_readahead() is OK.  Because
> __read_swap_cache_async() is called before that, so the swap entry will
> be marked with SWAP_HAS_CACHE, and page will be locked.
> 

Oh... I missed this. Many thanks for the reminder.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>>> Best Regards,
>>>>> Huang, Ying
>>>>>
>>>>>> Signed-off-by: Miaohe Lin 
>>>>>> ---
>>>>>>  mm/shmem.c | 6 ++
>>>>>>  1 file changed, 6 insertions(+)
>>>>>>
>>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>>> index 26c76b13ad23..936ba5595297 100644
>>>>>> --- a/mm/shmem.c
>>>>>> +++ b/mm/shmem.c
>>>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
>>>>>> vm_area_struct *vma)
>>>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>>>  struct shmem_inode_info *info, pgoff_t index)
>>>>>>  {
>>>>>> +struct swap_info_struct *si;
>>>>>>  struct vm_area_struct pvma;
>>>>>>  struct page *page;
>>>>>>  struct vm_fault vmf = {
>>>>>>  .vma = &pvma,
>>>>>>  };
>>>>>>  
>>>>>> +/* Prevent swapoff from happening to us. */
>>>>>> +si = get_swap_device(swap);
>>>>>> +if (unlikely(!si))
>>>>>> +return NULL;
>>>>>>  shmem_pseudo_vma_init(&pvma, info, index);
>>>>>>  page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>>>  shmem_pseudo_vma_destroy(&pvma);
>>>>>> +put_swap_device(si);
>>>>>>  
>>>>>>  return page;
>>>>>>  }
>>>>> .
>>>>>
>>> .
>>>
> .
> 



Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:09, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 10:48, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>> patch adds the percpu_ref support for swap.
>>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  include/linux/swap.h |  3 +++
>>>>  mm/swapfile.c| 33 +
>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 144727041e78..8be36eb58b7a 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>   * The in-memory structure used to track swap areas.
>>>>   */
>>>>  struct swap_info_struct {
>>>> +  struct percpu_ref users;/* serialization against concurrent 
>>>> swapoff */
>>>
>>> The comments aren't general enough.  We use this to check whether the
>>> swap device has been fully initialized, etc. May be something as below?
>>>
>>> /* indicate and keep swap device valid */
>>
>> Looks good.
>>
>>>
>>>>unsigned long   flags;  /* SWP_USED etc: see above */
>>>>signed shortprio;   /* swap priority of this type */
>>>>struct plist_node list; /* entry in swap_active_head */
>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>struct block_device *bdev;  /* swap device or bdev of swap file */
>>>>struct file *swap_file; /* seldom referenced */
>>>>unsigned int old_block_size;/* seldom referenced */
>>>> +  bool ref_initialized;   /* seldom referenced */
>>>> +  struct completion comp; /* seldom referenced */
>>>>  #ifdef CONFIG_FRONTSWAP
>>>>unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>>>atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>> index 149e77454e3c..66515a3a2824 100644
>>>> --- a/mm/swapfile.c
>>>> +++ b/mm/swapfile.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> +#include 
>>>>  
>>>>  #include 
>>>>  #include 
>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct 
>>>> *work)
>>>>spin_unlock(&si->lock);
>>>>  }
>>>>  
>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>> +{
>>>> +  struct swap_info_struct *si;
>>>> +
>>>> +  si = container_of(ref, struct swap_info_struct, users);
>>>> +  complete(&si->comp);
>>>> +}
>>>> +
>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>  {
>>>>struct swap_cluster_info *ci = si->cluster_info;
>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct 
>>>> *p, int prio,
>>>> * Guarantee swap_map, cluster_info, etc. fields are valid
>>>> * between get/put_swap_device() if SWP_VALID bit is set
>>>> */
>>>> -  synchronize_rcu();
>>>
>>> You cannot remove this without changing get/put_swap_device().  It's
>>> better to squash at least PATCH 1-2.
>>
>> Will squash PATCH 1-2. Thanks.
>>
>>>
>>>> +  percpu_ref_resurrect(&p->users);
>>>>spin_lock(&swap_lock);
>>>>spin_lock(&p->lock);
>>>>_enable_swap_info(p);
>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
>>>> specialfile)
>>>>p->flags &= ~SWP_VALID; /* mark swap device as invalid */
>>>>spin_unlock(&p->lock);
>>>>spin_unlock(&swap_lock);
>>>> +
>>>> +  percpu_ref_kill(&p->users);
>>>>/*
>>>> -   * wait for swap operations protected by get/put_swap_device()
>>>> -   * to complete
>>>> +   * We need synchronize_rcu() here to protect the accessing
>>>> +   * to the swap cache data structure.
>>>> */
>>>>synchronize_rc

Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:04, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 10:15, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1                                CPU 2
>>>> -----                                -----
>>>> shmem_swapin
>>>>   swap_cluster_readahead
>>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>                                      swapoff
>>>>                                        si->flags &= ~SWP_VALID;
>>>>                                        ..
>>>>                                        synchronize_rcu();
>>>>                                        ..
>>>
>>> You have removed this code in the previous patches of the series, and
>>> it is not relevant in this patch.
>>
>> Yes, I should change these. Thanks.
>>
>>>
>>>>                                        si->swap_file = NULL;
>>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>
>>>> Close this race window by using get/put_swap_device() to guard against
>>>> concurrent swapoff.
>>>>
>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>
>>> No.  This isn't the commit that introduces the race condition.  Please
>>> recheck your git blame result.
>>>
>>
>> I think it is really hard to find the exact commit. I used git blame and found
>> this race has existed ever since that code was introduced. Any suggestions?
>> Thanks.
> 
> I think the commit that introduces the race condition is commit
> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
> not")
> 

Thanks.
The commit log only describes one race condition, and for that one this would
be the correct Fixes tag. But there are still many other race windows inside
swap_cluster_readahead, such as the one in swap_readpage() called from
swap_cluster_readahead; this tag cannot cover them all.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  mm/shmem.c | 6 ++++++
>>>>  1 file changed, 6 insertions(+)
>>>>
>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>> index 26c76b13ad23..936ba5595297 100644
>>>> --- a/mm/shmem.c
>>>> +++ b/mm/shmem.c
>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>struct shmem_inode_info *info, pgoff_t index)
>>>>  {
>>>> +  struct swap_info_struct *si;
>>>>struct vm_area_struct pvma;
>>>>struct page *page;
>>>>struct vm_fault vmf = {
>>>>.vma = &pvma,
>>>>};
>>>>  
>>>> +  /* Prevent swapoff from happening to us. */
>>>> +  si = get_swap_device(swap);
>>>> +  if (unlikely(!si))
>>>> +  return NULL;
>>>>shmem_pseudo_vma_init(&pvma, info, index);
>>>>page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>shmem_pseudo_vma_destroy(&pvma);
>>>> +  put_swap_device(si);
>>>>  
>>>>return page;
>>>>  }
>>> .
>>>
> .
> 



Re: [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-18 Thread Miaohe Lin
On 2021/4/19 10:54, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> Use percpu_ref to serialize against concurrent swapoff. Also remove the
>> SWP_VALID flag because it was only used together with the RCU solution.
>>
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h |  3 +--
>>  mm/swapfile.c        | 43 +++++++++++++++++--------------------------
>>  2 files changed, 18 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 8be36eb58b7a..993693b38109 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -177,7 +177,6 @@ enum {
>>  SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
>>  SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
>>  SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
>> -SWP_VALID   = (1 << 13),/* swap is valid to be operated on? */
>>  /* add others here before... */
>>  SWP_SCANNING= (1 << 14),/* refcount in scan_swap_map */
>>  };
>> @@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -rcu_read_unlock();
>> +percpu_ref_put(&si->users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 66515a3a2824..90e197bc2eeb 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>>   * via preventing the swap device from being swapoff, until
>>   * put_swap_device() is called.  Otherwise return NULL.
>>   *
>> - * The entirety of the RCU read critical section must come before the
>> - * return from or after the call to synchronize_rcu() in
>> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
>> - * true, the si->map, si->cluster_info, etc. must be valid in the
>> - * critical section.
>> - *
>>   * Notice that swapoff or swapoff+swapon can still happen before the
>> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
>> - * in put_swap_device() if there isn't any other way to prevent
>> - * swapoff, such as page lock, page table lock, etc.  The caller must
>> - * be prepared for that.  For example, the following situation is
>> - * possible.
>> + * percpu_ref_tryget_live() in get_swap_device() or after the
>> + * percpu_ref_put() in put_swap_device() if there isn't any other way
>> + * to prevent swapoff, such as page lock, page table lock, etc.  The
>> + * caller must be prepared for that.  For example, the following
>> + * situation is possible.
>>   *
>>   *   CPU1   CPU2
>>   *   do_swap_page()
>> @@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>>  si = swp_swap_info(entry);
>>  if (!si)
>>  goto bad_nofile;
>> -
>> -rcu_read_lock();
>> -if (data_race(!(si->flags & SWP_VALID)))
>> -goto unlock_out;
>> +if (!percpu_ref_tryget_live(&si->users))
>> +goto out;
>> +/*
>> + * Guarantee we will not reference uninitialized fields
>> + * of swap_info_struct.
>> + */
> 
> /*
>  * Guarantee the si->users are checked before accessing other fields of
>  * swap_info_struct.
> */
> 
>> +smp_rmb();
> 
> Usually, smp_rmb() needs to be paired with smp_wmb().  Some comments are
> needed for that.  Here smp_rmb() is paired with the spin_unlock() after
> setup_swap_info() in enable_swap_info().
> 
>>  offset = swp_offset(entry);
>>  if (offset >= si->max)
>> -goto unlock_out;
>> +goto put_out;
>>  
>>  return si;
>>  bad_nofile:
>>  pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>>  out:
>>  return NULL;
>> -unlock_out:
>> -rcu_read_unlock();
>> +put_out:
>> +percpu_ref_put(&si->users);
>>  return NULL;
>>  }
>>  
>> @@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>>  
>>  static void _enable_swap_info(struct swap_info_struct *p)
>>  {
>> -p->flags |= SWP_WRITEOK | SWP_VALID;
>> +p->flags |= SWP_WRITEOK;
>>  atomic_long_add(p->pages, &nr_swap_pages);
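
The pairing described above can be sketched as follows (an illustrative
reduction, not the exact upstream code; the writer side relies on the RELEASE
semantics of spin_unlock() rather than an explicit smp_wmb()):

  /* writer: enable_swap_info() */
  spin_lock(&p->lock);
  setup_swap_info(p, prio, swap_map, cluster_info); /* publish the fields */
  spin_unlock(&p->lock);                 /* RELEASE: orders the stores above */
  percpu_ref_resurrect(&p->users);       /* tryget_live() can now succeed */

  /* reader: get_swap_device() */
  if (!percpu_ref_tryget_live(&si->users))
          goto out;
  smp_rmb();      /* pairs with the release above: subsequent loads of
                   * si->swap_map, si->cluster_info, etc. see the
                   * initialized values */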

Re: [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff

2021-04-18 Thread Miaohe Lin
On 2021/4/19 10:23, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1                                CPU 2
>> -----                                -----
>> do_swap_page
> 
> This is OK for swap cache cases.  So
> 
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
> 
> should be shown here.

Ok.

> 
>>   swap_readpage(skip swap cache case)
>>     if (data_race(sis->flags & SWP_FS_OPS)) {
>>                                      swapoff
>>                                        p->flags &= ~SWP_VALID;
>>                                        ..
>>                                        synchronize_rcu();
>>                                        ..
>>                                        p->swap_file = NULL;
>>     struct file *swap_file = sis->swap_file;
>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>
>> Note that for pages that are swapped in through the swap cache, this isn't
>> an issue: the page is locked, and the swap entry will be marked with
>> SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
>> unlocked.
>>
>> Using the current get/put_swap_device() to guard against concurrent swapoff
>> for swap_readpage() looks terrible because swap_readpage() may take a really
>> long time. And this race may not be really pernicious because swapoff is
>> usually done only at system shutdown. To reduce the performance overhead on
>> the hot path as much as possible, it appears we can use percpu_ref to close
>> this race window (as suggested by Huang, Ying).
> 
> I still suggest squashing PATCH 1-3, at least PATCH 1-2.  That will
> change the relevant code together and make it easier to review.
> 

Will squash PATCH 1-2. Thanks.

> Best Regards,
> Huang, Ying
> 
>> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
>> Reported-by: kernel test robot  (auto build test ERROR)
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h | 9 +++++++++
>>  mm/memory.c          | 9 +++++++++
>>  2 files changed, 18 insertions(+)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 993693b38109..523c2411a135 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -528,6 +528,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>>  return NULL;
>>  }
>>  
>> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
>> +{
>> +return NULL;
>> +}
>> +
>> +static inline void put_swap_device(struct swap_info_struct *si)
>> +{
>> +}
>> +
>>  #define swap_address_space(entry)   (NULL)
>>  #define get_nr_swap_pages() 0L
>>  #define total_swap_pages0L
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 27014c3bde9f..7a2fe12cf641 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  {
>>  struct vm_area_struct *vma = vmf->vma;
>>  struct page *page = NULL, *swapcache;
>> +struct swap_info_struct *si = NULL;
>>  swp_entry_t entry;
>>  pte_t pte;
>>  int locked;
>> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  goto out;
>>  }
>>  
>> +/* Prevent swapoff from happening to us. */
>> +si = get_swap_device(entry);
>> +if (unlikely(!si))
>> +goto out;
>>  
>>  delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>>  page = lookup_swap_cache(entry, vma, vmf->address);
>> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  out:
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  out_nomap:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock_page(swapcache);
>>  put_page(swapcache);
>>  }
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  }
> .
> 
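
The caller-side pattern this series converges on, used by do_swap_page()
above and by shmem_swapin() in the next message, reduces to the sketch below
(peek_swap_backing() is a hypothetical helper invented for illustration):

  /* Illustrative only: a swap-entry reader guarded against swapoff. */
  static int peek_swap_backing(swp_entry_t entry)
  {
          struct swap_info_struct *si;

          si = get_swap_device(entry);    /* takes a percpu ref, or NULL */
          if (unlikely(!si))
                  return -ENOENT;         /* stale entry: swapoff won the race */

          /* si->swap_file, si->cluster_info, etc. are stable in this window */
          pr_info("swap backing: %pD\n", si->swap_file);

          put_swap_device(si);            /* drop the ref; swapoff may proceed */
          return 0;
  }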



Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-18 Thread Miaohe Lin
On 2021/4/19 10:15, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1                                CPU 2
>> -----                                -----
>> shmem_swapin
>>   swap_cluster_readahead
>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>                                      swapoff
>>                                        si->flags &= ~SWP_VALID;
>>                                        ..
>>                                        synchronize_rcu();
>>                                        ..
> 
> You have removed this code in the previous patches of the series, and
> it is not relevant in this patch.

Yes, I should change these. Thanks.

> 
>>                                        si->swap_file = NULL;
>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>
>> Close this race window by using get/put_swap_device() to guard against
>> concurrent swapoff.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> 
> No.  This isn't the commit that introduces the race condition.  Please
> recheck your git blame result.
> 

I think it is really hard to find the exact commit. I used git blame and found
this race has existed ever since that code was introduced. Any suggestions?
Thanks.

> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/shmem.c | 6 ++++++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 26c76b13ad23..936ba5595297 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>  struct shmem_inode_info *info, pgoff_t index)
>>  {
>> +struct swap_info_struct *si;
>>  struct vm_area_struct pvma;
>>  struct page *page;
>>  struct vm_fault vmf = {
>>  .vma = &pvma,
>>  };
>>  
>> +/* Prevent swapoff from happening to us. */
>> +si = get_swap_device(swap);
>> +if (unlikely(!si))
>> +return NULL;
>>  shmem_pseudo_vma_init(&pvma, info, index);
>>  page = swap_cluster_readahead(swap, gfp, &vmf);
>>  shmem_pseudo_vma_destroy(&pvma);
>> +put_swap_device(si);
>>  
>>  return page;
>>  }
> .
> 


