[PATCH net-next v9 7/7] rstreason: make it work in trace world

2024-04-24 Thread Jason Xing
From: Jason Xing 

Finally, let the reset reason mechanism work in the trace world by
passing the reason to the tcp_send_reset tracepoint.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 26 ++
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..49b5ee091cf6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN
+#define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+DEFINE_RST_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason) { SK_RST_REASON_##reason, #reason },
+#define FNe(reason){ SK_RST_REASON_##reason, #reason }
+
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6bd3a0fb9439..6096ac7a3a02 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e4f5c8b5172a..a8d5f22c079b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3640,7 +3640,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 317d7a6e6b01..77958adf2e16 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3




[PATCH net-next v9 6/7] mptcp: introducing a helper into active reset logic

2024-04-24 Thread Jason Xing
From: Jason Xing 

Since every MPTCP reset reason definition is now mapped in enum
sk_rst_reason, introduce a new helper that covers the places where
subflow->reset_reason has already been set.

Note: SK_RST_REASON_NOT_SPECIFIED is the same as
SK_RST_REASON_MPTCP_RST_EUNSPEC; both mean "unknown", so such sites can
be converted directly.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
Reviewed-by: Matthieu Baerts (NGI0) 
---
Link: https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include <net/rstreason.h>
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 252618859ee8..cfc5f9c3f113 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "mptcp_pm_gen.h"
 
@@ -608,6 +609,16 @@ sk_rst_convert_mptcp_reason(u32 reason)
}
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fb7abf2d01ca..97ec44d1df30 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include <net/rstreason.h>
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3




[PATCH net-next v9 5/7] mptcp: support rstreason for passive reset

2024-04-24 Thread Jason Xing
From: Jason Xing 

The passive reset path relies on the reset options carried in the skb,
as RFC 8684 describes, and reusing this logic saves us a lot of effort.
This patch replaces most of the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
Reviewed-by: Matthieu Baerts (NGI0) 
---
 net/mptcp/protocol.h | 27 +++
 net/mptcp/subflow.c  | 22 +-
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..252618859ee8 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -581,6 +581,33 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_mptcp_reason(u32 reason)
+{
+   switch (reason) {
+   case MPTCP_RST_EUNSPEC:
+   return SK_RST_REASON_MPTCP_RST_EUNSPEC;
+   case MPTCP_RST_EMPTCP:
+   return SK_RST_REASON_MPTCP_RST_EMPTCP;
+   case MPTCP_RST_ERESOURCE:
+   return SK_RST_REASON_MPTCP_RST_ERESOURCE;
+   case MPTCP_RST_EPROHIBIT:
+   return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
+   case MPTCP_RST_EWQ2BIG:
+   return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
+   case MPTCP_RST_EBADPERF:
+   return SK_RST_REASON_MPTCP_RST_EBADPERF;
+   case MPTCP_RST_EMIDDLEBOX:
+   return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
+   default:
+   /* It should not happen, or else errors may occur
+* in MPTCP layer
+*/
+   return SK_RST_REASON_ERROR;
+   }
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..fb7abf2d01ca 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = sk_rst_convert_mptcp_reason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3




[PATCH net-next v9 4/7] tcp: support rstreason for passive reset

2024-04-24 Thread Jason Xing
From: Jason Xing 

Reuse the drop reason logic to show the exact reason for the TCP reset,
so we can finally display the corresponding item in enum sk_rst_reason
instead of reinventing new reset reasons. This patch replaces all
the prior NOT_SPECIFIED reasons.
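
As a purely illustrative sketch (not part of this patch), covering one
more drop reason later only needs an extra case in the new helper;
SK_RST_REASON_TCP_FLAGS below is a hypothetical rstreason entry paired
with the existing SKB_DROP_REASON_TCP_FLAGS drop reason:

	case SKB_DROP_REASON_TCP_FLAGS:
		return SK_RST_REASON_TCP_FLAGS;	/* hypothetical new entry */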

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 15 +++
 net/ipv4/tcp_ipv4.c | 11 +++
 net/ipv6/tcp_ipv6.c | 11 +++
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index bc53b5a24505..df3b6ac0c9b3 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -103,4 +103,19 @@ enum sk_rst_reason {
 */
SK_RST_REASON_MAX,
 };
+
+/* Convert skb drop reasons to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_drop_reason(enum skb_drop_reason reason)
+{
+   switch (reason) {
+   case SKB_DROP_REASON_NOT_SPECIFIED:
+   return SK_RST_REASON_NOT_SPECIFIED;
+   case SKB_DROP_REASON_NO_SOCKET:
+   return SK_RST_REASON_NO_SOCKET;
+   default:
+   /* If we don't have our own corresponding reason */
+   return SK_RST_REASON_NOT_SPECIFIED;
+   }
+}
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..6bd3a0fb9439 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = sk_rst_convert_drop_reason(drop_reason);
+   tcp_v4_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2360,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -2409,7 +2412,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..317d7a6e6b01 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = sk_rst_convert_drop_reason(drop_reason);
+   tcp_v6_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1945,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -1998,7 +2001,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));

[PATCH net-next v9 3/7] rstreason: prepare for active reset

2024-04-24 Thread Jason Xing
From: Jason Xing 

Like what we did for the passive reset paths, pass a possible reset
reason along each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 99a1d88f7f47..e4f5c8b5172a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3614,7 +3614,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
(!tp->snd_wnd && 

[PATCH net-next v9 2/7] rstreason: prepare for passive reset

2024-04-24 Thread Jason Xing
From: Jason Xing 

Adjust the parameters and support passing the reason for the reset,
which is NOT_SPECIFIED for now. No functional changes.

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include <net/rstreason.h>
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff 

[PATCH net-next v9 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-24 Thread Jason Xing
From: Jason Xing 

Add a new standalone file to make future extension easy, supporting
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for the reset reason mechanism;
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) our own independent reasons which aren't relying on other reasons at all
3) reuse MP_TCPRST option for MPTCP

The benefits of a standalone reset reason are listed here:
1) it can cover more than one case, such as reset reasons in MPTCP,
active reset reasons.
2) people can easily and quickly understand and maintain this mechanism.
3) we get unified format of output with prefix stripped.
4) more new reset reasons are on the way
...

This series implements the basic active/passive reset reason code in
those three protocols; it is not complete for the moment. For the
passive reset part in TCP, I only introduce the common NO_SOCKET case,
which can serve as an example.

After this series is applied, it opens the gate for other people to
contribute more reasons :)

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/rstreason.h | 106 
 1 file changed, 106 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..bc53b5a24505
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(NOT_SPECIFIED)   \
+   FN(NO_SOCKET)   \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(ERROR)   \
+   FNe(MAX)
+
+/**
+ * enum sk_rst_reason - the reasons of socket reset
+ *
+ * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ *
+ * There are three parts in order:
+ * 1) skb drop reasons: relying on drop reasons for such as passive reset
+ * 2) independent reset reasons: such as active reset reasons
+ * 3) reset reasons in MPTCP: only for MPTCP use
+ */
+enum sk_rst_reason {
+   /* Refer to include/net/dropreason-core.h
+* Rely on skb drop reasons because it indicates exactly why RST
+* could happen.
+*/
+   /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */
+   SK_RST_REASON_NOT_SPECIFIED,
+   /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
+   SK_RST_REASON_NO_SOCKET,
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /**
+* @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /**
+* @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable 

[PATCH net-next v9 0/7] Implement reset reason mechanism to detect

2024-04-24 Thread Jason Xing
From: Jason Xing 

In production there are many cases where an RST skb is sent, but we
don't have a convenient/fast method to detect the exact underlying
reason.

RST comes in two kinds: passive (like tcp_v4_send_reset()) and active
(like tcp_send_active_reset()). The former can be traced carefully
1) in TCP, with the help of drop reasons, based on Eric's idea [1],
and 2) in MPTCP, with the help of the reset options defined in
RFC 8684. The latter is relatively independent and has to be
implemented on our own: active reset reasons cannot be replaced by skb
drop reasons or anything similar.

In this series, I focus mostly on the fundamental implementation of
how the rstreason mechanism works and give the detailed passive part
as an example, not including the active reset part. In the future, we
can go further and refine the remaining NOT_SPECIFIED reasons (see the
sketch after the trace examples below).

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET
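
For illustration only (not part of this series), refining one of those
NOT_SPECIFIED call sites later would mean adding a more specific entry
to DEFINE_RST_REASON() and passing it at the call site. The name
SK_RST_REASON_TCP_ABORT_ON_MEMORY below is hypothetical:

	/* hypothetical follow-up sketch on top of this series */
	#define DEFINE_RST_REASON(FN, FNe)		\
		FN(NOT_SPECIFIED)			\
		FN(NO_SOCKET)				\
		/* ... existing entries ... */		\
		FN(TCP_ABORT_ON_MEMORY)	/* new, hypothetical */	\
		FNe(MAX)

	/* e.g. in the tcp_check_oom() path of __tcp_close(), instead of
	 * the catch-all reason:
	 */
	tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_ABORT_ON_MEMORY);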

[1]
Link: https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v9
Link: https://lore.kernel.org/all/20240423072137.65168-1-kerneljasonx...@gmail.com/
1. address nit problem (Matt)
2. add acked-by and reviewed-by tags (Matt)

v8
Link: https://lore.kernel.org/all/20240422030109.12891-1-kerneljasonx...@gmail.com/
1. put sk reset reasons into more natural order (Matt)
2. adjust those helper position (Matt)
3. rename two convert function (Matt)
4. make the kdoc format correct (Simon)

v7
Link: https://lore.kernel.org/all/20240417085143.69578-1-kerneljasonx...@gmail.com/
1. get rid of enum casts which could bring potential issues (Eric)
2. use switch-case method to map between reset reason in MPTCP and sk reset
reason (Steven)
3. use switch-case method to map between skb drop reason and sk reset
reason

v6
1. add back casts, or else they are treated as error.

v5
Link: https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree


Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |   4 +-
 include/net/rstreason.h| 121 +
 include/net/tcp.h  |   3 +-
 include/trace/events/tcp.h |  26 ++--
 net/dccp/ipv4.c|  10 +--
 net/dccp/ipv6.c|  10 +--
 net/dccp/minisocks.c   |   3 +-
 net/ipv4/tcp.c |  15 +++--
 net/ipv4/tcp_ipv4.c|  17 --
 net/ipv4/tcp_minisocks.c   |   3 +-
 net/ipv4/tcp_output.c  |   5 +-
 net/ipv4/tcp_timer.c   |   9 ++-
 net/ipv6/tcp_ipv6.c|  20 +++---
 net/mptcp/protocol.c   |   2 +-
 net/mptcp/protocol.h   |  38 
 net/mptcp/subflow.c|  27 ++---
 16 files changed, 266 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3




[PATCH RFC] rethook: inline arch_rethook_trampoline_callback() in assembly code

2024-04-24 Thread Andrii Nakryiko
At the lowest level, rethook-based kretprobes on the x86-64 architecture
go through the arch_rethook_trampoline() function, manually written in
assembly, which calls into a simple arch_rethook_trampoline_callback()
function, written in C, that only does a few straightforward field
assignments before calling further into rethook_trampoline_handler(),
which handles kretprobe callbacks generically.

Given the simplicity of arch_rethook_trampoline_callback(), it does not
seem worthwhile to spend an extra function call just to do 4 or
5 assignments. As such, this patch proposes to "inline"
arch_rethook_trampoline_callback() into arch_rethook_trampoline() by
implementing it manually in assembly.

This has two motivations. First, we do get a bit of runtime speed-up by
avoiding function calls. Using the BPF selftests' bench tool, we see a
0.6%-0.8% throughput improvement for the kretprobe/multi-kretprobe
triggering code path:

BEFORE (latest probes/for-next)
===
kretprobe  :   10.455 ± 0.024M/s
kretprobe-multi:   11.150 ± 0.012M/s

AFTER (probes/for-next + this patch)

kretprobe  :   10.540 ± 0.009M/s (+0.8%)
kretprobe-multi:   11.219 ± 0.042M/s (+0.6%)

Second, and no less importantly for some specialized use cases, this
avoids unnecessarily "polluting" LBR records with an extra function call
(recorded as a jump by the CPU). This is the case for the retsnoop ([0])
tool, which relies heavily on capturing LBR records to provide users with
lots of insight into kernel internals.

This RFC patch is only inlining this function for x86-64, but it's
possible to do that for 32-bit x86 arch as well and then remove
arch_rethook_trampoline_callback() implementation altogether. Please let
me know if this change is acceptable and whether I should complete it
with 32-bit "inlining" as well. Thanks!

  [0] 
https://nakryiko.com/posts/retsnoop-intro/#peering-deep-into-functions-with-lbr

Signed-off-by: Andrii Nakryiko 
---
 arch/x86/kernel/asm-offsets_64.c |  4 
 arch/x86/kernel/rethook.c| 37 +++-
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..5c444abc540c 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -42,6 +42,10 @@ int main(void)
ENTRY(r14);
ENTRY(r15);
ENTRY(flags);
+   ENTRY(ip);
+   ENTRY(cs);
+   ENTRY(ss);
+   ENTRY(orig_ax);
BLANK();
 #undef ENTRY
 
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
index 8a1c0111ae79..3e1c01beebd1 100644
--- a/arch/x86/kernel/rethook.c
+++ b/arch/x86/kernel/rethook.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "kprobes/common.h"
 
@@ -34,10 +35,36 @@ asm(
"   pushq %rsp\n"
"   pushfq\n"
SAVE_REGS_STRING
-   "   movq %rsp, %rdi\n"
-   "   call arch_rethook_trampoline_callback\n"
+   "   movq %rsp, %rdi\n" /* $rdi points to regs */
+   /* fixup registers */
+   /* regs->cs = __KERNEL_CS; */
+   "   movq $" __stringify(__KERNEL_CS) ", " __stringify(pt_regs_cs) 
"(%rdi)\n"
+   /* regs->ip = (unsigned long)_rethook_trampoline; */
+   "   movq $arch_rethook_trampoline, " __stringify(pt_regs_ip) 
"(%rdi)\n"
+   /* regs->orig_ax = ~0UL; */
+   "   movq $0x, " __stringify(pt_regs_orig_ax) 
"(%rdi)\n"
+   /* regs->sp += 2*sizeof(long); */
+   "   addq $16, " __stringify(pt_regs_sp) "(%rdi)\n"
+   /* 2nd arg is frame_pointer = (long *)(regs + 1); */
+   "   lea " __stringify(PTREGS_SIZE) "(%rdi), %rsi\n"
+   /*
+* The return address at 'frame_pointer' is recovered by the
+* arch_rethook_fixup_return() which called from this
+* rethook_trampoline_handler().
+*/
+   "   call rethook_trampoline_handler\n"
+   /*
+* Copy FLAGS to 'pt_regs::ss' so we can do RET right after POPF.
+*
+* We don't save/restore %rax below, because we ignore
+* rethook_trampoline_handler result.
+*
+* *(unsigned long *)&regs->ss = regs->flags;
+*/
+   "   mov " __stringify(pt_regs_flags) "(%rsp), %rax\n"
+   "   mov %rax, " __stringify(pt_regs_ss) "(%rsp)\n"
RESTORE_REGS_STRING
-   /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+   /* We just copied 'regs->flags' into 'regs->ss'. */
"   addq $16, %rsp\n"
"   popfq\n"
 #else
@@ -61,6 +88,7 @@ asm(
 );
 NOKPROBE_SYMBOL(arch_rethook_trampoline);
 
+#ifdef CONFIG_X86_32
 /*
  * Called from arch_rethook_trampoline
  */
@@ -70,9 +98,7 @@ __used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs)
 
/* fixup registers */
regs->cs = __KERNEL_CS;
-#ifdef CONFIG_X86_32
regs->gs = 0;
-#endif

[PATCH 5.4,4.19 2/2] tracing: Increase PERF_MAX_TRACE_SIZE to handle Sentinel1 and docker together

2024-04-24 Thread Thadeu Lima de Souza Cascardo
From: "Robin H. Johnson" 

commit e531e90b5ab0f7ce5ff298e165214c1aec6ed187 upstream.

Running endpoint security solutions like Sentinel1 that use perf-based
tracing heavily leads to this repeated dump complaining about dockerd.
The default value of 2048 is nowhere near large enough.

Using the prior patch "tracing: show size of requested buffer", we get
"perf buffer not large enough, wanted 6644, have 6144", after repeated
up-sizing (I did 2/4/6/8K). With 8K, the problem doesn't occur at all,
so below is the trace for 6K.

I'm wondering if this value should be selectable at boot time, but this
is a good starting point.

```
[ cut here ]
perf buffer not large enough, wanted 6644, have 6144
WARNING: CPU: 1 PID: 4997 at kernel/trace/trace_event_perf.c:402 
perf_trace_buf_alloc+0x8c/0xa0
Modules linked in: [..]
CPU: 1 PID: 4997 Comm: sh Tainted: GT 
5.13.13-x86_64-00039-gb3959163488e #63
Hardware name: LENOVO 20KH002JUS/20KH002JUS, BIOS N23ET66W (1.41 ) 09/02/2019
RIP: 0010:perf_trace_buf_alloc+0x8c/0xa0
Code: 80 3d 43 97 d0 01 00 74 07 31 c0 5b 5d 41 5c c3 ba 00 18 00 00 89 ee 48 
c7 c7 00 82 7d 91 c6 05 25 97 d0 01 01 e8 22 ee bc 00 <0f> 0b 31 c0 eb db 66 66 
2e 0f 1f 84 00 00 00 00 00 0f 1f 00 55 89
RSP: 0018:b922026b7d58 EFLAGS: 00010282
RAX:  RBX: 9da5ee012000 RCX: 0027
RDX: 9da881657828 RSI: 0001 RDI: 9da881657820
RBP: 19f4 R08:  R09: b922026b7b80
R10: b922026b7b78 R11: 91dda688 R12: 000f
R13: 9da5ee012108 R14: 9da8816570a0 R15: b922026b7e30
FS:  7f420db1a080() GS:9da88164() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0060 CR3: 0002504a8006 CR4: 003706e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 kprobe_perf_func+0x11e/0x270
 ? do_execveat_common.isra.0+0x1/0x1c0
 ? do_execveat_common.isra.0+0x5/0x1c0
 kprobe_ftrace_handler+0x10e/0x1d0
 0xc03aa0c8
 ? do_execveat_common.isra.0+0x1/0x1c0
 do_execveat_common.isra.0+0x5/0x1c0
 __x64_sys_execve+0x33/0x40
 do_syscall_64+0x6b/0xc0
 ? do_syscall_64+0x11/0xc0
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f420dc1db37
Code: ff ff 76 e7 f7 d8 64 41 89 00 eb df 0f 1f 80 00 00 00 00 f7 d8 64 41 89 
00 eb dc 0f 1f 84 00 00 00 00 00 b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 
c3 48 8b 0d 01 43 0f 00 f7 d8 64 89 01 48
RSP: 002b:7ffd4e8b4e38 EFLAGS: 0246 ORIG_RAX: 003b
RAX: ffda RBX:  RCX: 7f420dc1db37
RDX: 564338d1e740 RSI: 564338d32d50 RDI: 564338d28f00
RBP: 564338d28f00 R08: 564338d32d50 R09: 0020
R10: 01b6 R11: 0246 R12: 564338d28f00
R13: 564338d32d50 R14: 564338d1e740 R15: 564338d28c60
---[ end trace 83ab3e8e16275e49 ]---
```

Link: https://lkml.kernel.org/r/20210831043723.13481-2-robb...@gentoo.org

Signed-off-by: Robin H. Johnson 
Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Thadeu Lima de Souza Cascardo 
---
 include/linux/trace_events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index b8b87e7ba93f..12ee8973ea6f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -427,7 +427,7 @@ struct trace_event_file {
}   \
early_initcall(trace_init_perf_perm_##name);
 
-#define PERF_MAX_TRACE_SIZE2048
+#define PERF_MAX_TRACE_SIZE8192
 
 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
 
-- 
2.34.1




[PATCH 5.15,5.10 2/2] tracing: Increase PERF_MAX_TRACE_SIZE to handle Sentinel1 and docker together

2024-04-24 Thread Thadeu Lima de Souza Cascardo
From: "Robin H. Johnson" 

commit e531e90b5ab0f7ce5ff298e165214c1aec6ed187 upstream.

Running endpoint security solutions like Sentinel1 that use perf-based
tracing heavily leads to this repeated dump complaining about dockerd.
The default value of 2048 is nowhere near large enough.

Using the prior patch "tracing: show size of requested buffer", we get
"perf buffer not large enough, wanted 6644, have 6144", after repeated
up-sizing (I did 2/4/6/8K). With 8K, the problem doesn't occur at all,
so below is the trace for 6K.

I'm wondering if this value should be selectable at boot time, but this
is a good starting point.

```
[ cut here ]
perf buffer not large enough, wanted 6644, have 6144
WARNING: CPU: 1 PID: 4997 at kernel/trace/trace_event_perf.c:402 
perf_trace_buf_alloc+0x8c/0xa0
Modules linked in: [..]
CPU: 1 PID: 4997 Comm: sh Tainted: GT 
5.13.13-x86_64-00039-gb3959163488e #63
Hardware name: LENOVO 20KH002JUS/20KH002JUS, BIOS N23ET66W (1.41 ) 09/02/2019
RIP: 0010:perf_trace_buf_alloc+0x8c/0xa0
Code: 80 3d 43 97 d0 01 00 74 07 31 c0 5b 5d 41 5c c3 ba 00 18 00 00 89 ee 48 
c7 c7 00 82 7d 91 c6 05 25 97 d0 01 01 e8 22 ee bc 00 <0f> 0b 31 c0 eb db 66 66 
2e 0f 1f 84 00 00 00 00 00 0f 1f 00 55 89
RSP: 0018:b922026b7d58 EFLAGS: 00010282
RAX:  RBX: 9da5ee012000 RCX: 0027
RDX: 9da881657828 RSI: 0001 RDI: 9da881657820
RBP: 19f4 R08:  R09: b922026b7b80
R10: b922026b7b78 R11: 91dda688 R12: 000f
R13: 9da5ee012108 R14: 9da8816570a0 R15: b922026b7e30
FS:  7f420db1a080() GS:9da88164() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0060 CR3: 0002504a8006 CR4: 003706e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 kprobe_perf_func+0x11e/0x270
 ? do_execveat_common.isra.0+0x1/0x1c0
 ? do_execveat_common.isra.0+0x5/0x1c0
 kprobe_ftrace_handler+0x10e/0x1d0
 0xc03aa0c8
 ? do_execveat_common.isra.0+0x1/0x1c0
 do_execveat_common.isra.0+0x5/0x1c0
 __x64_sys_execve+0x33/0x40
 do_syscall_64+0x6b/0xc0
 ? do_syscall_64+0x11/0xc0
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f420dc1db37
Code: ff ff 76 e7 f7 d8 64 41 89 00 eb df 0f 1f 80 00 00 00 00 f7 d8 64 41 89 
00 eb dc 0f 1f 84 00 00 00 00 00 b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 
c3 48 8b 0d 01 43 0f 00 f7 d8 64 89 01 48
RSP: 002b:7ffd4e8b4e38 EFLAGS: 0246 ORIG_RAX: 003b
RAX: ffda RBX:  RCX: 7f420dc1db37
RDX: 564338d1e740 RSI: 564338d32d50 RDI: 564338d28f00
RBP: 564338d28f00 R08: 564338d32d50 R09: 0020
R10: 01b6 R11: 0246 R12: 564338d28f00
R13: 564338d32d50 R14: 564338d1e740 R15: 564338d28c60
---[ end trace 83ab3e8e16275e49 ]---
```

Link: https://lkml.kernel.org/r/20210831043723.13481-2-robb...@gentoo.org

Signed-off-by: Robin H. Johnson 
Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Thadeu Lima de Souza Cascardo 
---
 include/linux/trace_events.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d3cbe4bf4fab..17575aa2a53c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -676,7 +676,7 @@ struct trace_event_file {
}   \
early_initcall(trace_init_perf_perm_##name);
 
-#define PERF_MAX_TRACE_SIZE2048
+#define PERF_MAX_TRACE_SIZE8192
 
 #define MAX_FILTER_STR_VAL 256U/* Should handle KSYM_SYMBOL_LEN */
 
-- 
2.34.1




[PATCH 5.15,5.10,5.4,4.19 0/2] Fix warning when tracing with large filenames

2024-04-24 Thread Thadeu Lima de Souza Cascardo
The warning described in the patch "tracing: Increase PERF_MAX_TRACE_SIZE to
handle Sentinel1 and docker together" can be triggered with a perf probe on
do_execve with a large path. As PATH_MAX is larger than PERF_MAX_TRACE_SIZE
(2048 before the patch), the warning will trigger.

The fix was included in 5.16, so backporting to 5.15 and earlier LTS
kernels. Also included is a patch that better describes the attempted
allocation size.

-- 
2.34.1




[PATCH 5.15,5.10,5.4,4.19 1/2] tracing: Show size of requested perf buffer

2024-04-24 Thread Thadeu Lima de Souza Cascardo
From: "Robin H. Johnson" 

commit a90afe8d020da9298c98fddb19b7a6372e2feb45 upstream.

If the perf buffer isn't large enough, provide a hint about how large it
needs to be for whatever is running.

Link: https://lkml.kernel.org/r/20210831043723.13481-1-robb...@gentoo.org

Signed-off-by: Robin H. Johnson 
Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Thadeu Lima de Souza Cascardo 
---
 kernel/trace/trace_event_perf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 083f648e3265..61e3a2620fa3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -401,7 +401,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough, wanted %d, have %d",
+ size, PERF_MAX_TRACE_SIZE))
return NULL;
 
*rctxp = rctx = perf_swevent_get_recursion_context();
-- 
2.34.1




[PATCH 2/2] objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids

2024-04-24 Thread Andrii Nakryiko
Profiling shows that calling num_possible_cpus() in objpool_pop() takes
a noticeable amount of CPU (when profiled on an 80-core machine), as it
needs to recalculate the number of set bits in a CPU bit mask. This
number can't change, so there is no point in paying the price of
recalculating it. As such, cache this value in struct objpool_head and
use it in objpool_pop().

On the other hand, the cached pool->nr_cpus isn't necessary, as it's not
used in the hot path and is also a pretty trivial value to retrieve. So
drop pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the
size of struct objpool_head remains the same, which is a nice bonus.

Same BPF selftests benchmarks were used to evaluate the effect. Using
changes in previous patch (inlining of objpool_pop/objpool_push) as
baseline, here are the differences:

BASELINE

kretprobe  :9.937 ± 0.174M/s
kretprobe-multi:   10.440 ± 0.108M/s

AFTER
=
kretprobe  :   10.106 ± 0.120M/s (+1.7%)
kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

Cc: Matt (Qiang) Wu 
Signed-off-by: Andrii Nakryiko 
---
 include/linux/objpool.h |  6 +++---
 lib/objpool.c   | 12 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index d8b1f7b91128..cb1758eaa2d3 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
  * struct objpool_head - object pooling metadata
  * @obj_size:   object size, aligned to sizeof(void *)
  * @nr_objs:total objs (to be pre-allocated with objpool)
- * @nr_cpus:local copy of nr_cpu_ids
+ * @nr_possible_cpus: cached value of num_possible_cpus()
  * @capacity:   max objs can be managed by one objpool_slot
  * @gfp:gfp flags for kmalloc & vmalloc
  * @ref:refcount of objpool
@@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
 struct objpool_head {
int obj_size;
int nr_objs;
-   int nr_cpus;
+   int nr_possible_cpus;
int capacity;
gfp_t   gfp;
refcount_t  ref;
@@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
raw_local_irq_save(flags);
 
cpu = raw_smp_processor_id();
-   for (i = 0; i < num_possible_cpus(); i++) {
+   for (i = 0; i < pool->nr_possible_cpus; i++) {
obj = __objpool_try_get_slot(pool, cpu);
if (obj)
break;
diff --git a/lib/objpool.c b/lib/objpool.c
index f696308fc026..234f9d0bd081 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 {
int i, cpu_count = 0;
 
-   for (i = 0; i < pool->nr_cpus; i++) {
+   for (i = 0; i < nr_cpu_ids; i++) {
 
struct objpool_slot *slot;
int nodes, size, rc;
@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
continue;
 
/* compute how many objects to be allocated with this slot */
-   nodes = nr_objs / num_possible_cpus();
-   if (cpu_count < (nr_objs % num_possible_cpus()))
+   nodes = nr_objs / pool->nr_possible_cpus;
+   if (cpu_count < (nr_objs % pool->nr_possible_cpus))
nodes++;
cpu_count++;
 
@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
if (!pool->cpu_slots)
return;
 
-   for (i = 0; i < pool->nr_cpus; i++)
+   for (i = 0; i < nr_cpu_ids; i++)
kvfree(pool->cpu_slots[i]);
kfree(pool->cpu_slots);
 }
@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 
/* initialize objpool pool */
memset(pool, 0, sizeof(struct objpool_head));
-   pool->nr_cpus = nr_cpu_ids;
+   pool->nr_possible_cpus = num_possible_cpus();
pool->obj_size = object_size;
pool->capacity = capacity;
pool->gfp = gfp & ~__GFP_ZERO;
pool->context = context;
pool->release = release;
-   slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+   slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
pool->cpu_slots = kzalloc(slot_size, pool->gfp);
if (!pool->cpu_slots)
return -ENOMEM;
-- 
2.43.0




[PATCH 1/2] objpool: enable inlining objpool_push() and objpool_pop() operations

2024-04-24 Thread Andrii Nakryiko
objpool_push() and objpool_pop() are very performance-critical functions
and can be called very frequently in the kretprobe triggering path.

As such, it makes sense to allow the compiler to inline them completely
to eliminate function call overhead. Luckily, their logic is quite well
isolated and doesn't have any sprawling dependencies.

This patch moves both objpool_push() and objpool_pop() into
include/linux/objpool.h and marks them as static inline functions,
enabling inlining. To avoid anyone using internal helpers
(objpool_try_get_slot, objpool_try_add_slot), rename them to use leading
underscores.
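
For context, here is a minimal sketch of the kind of hot-path user this
affects; the caller and object type below are illustrative only, while
objpool_pop()/objpool_push() are the functions made inlinable by this
patch:

	/* illustrative objpool user; struct my_obj is hypothetical */
	struct my_obj { int val; };

	static void hot_path(struct objpool_head *pool)
	{
		struct my_obj *obj;

		obj = objpool_pop(pool);	/* lockless per-CPU pop, now inlined */
		if (!obj)
			return;
		obj->val++;			/* use the object */
		objpool_push(obj, pool);	/* return it to the pool, also inlined */
	}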

We used the kretprobe microbenchmark from BPF selftests (bench trig-kprobe
and trig-kprobe-multi benchmarks) running no-op BPF kretprobe/kretprobe.multi
programs in a tight loop to evaluate the effect. BPF's own overhead in
this case is minimal, and it mostly stresses the rest of the in-kernel
kretprobe infrastructure. Results are in millions of calls per
second. This is not super scientific, but it shows the trend nevertheless.

BEFORE
==
kretprobe  :9.794 ± 0.086M/s
kretprobe-multi:   10.219 ± 0.032M/s

AFTER
=
kretprobe  :9.937 ± 0.174M/s (+1.5%)
kretprobe-multi:   10.440 ± 0.108M/s (+2.2%)

Cc: Matt (Qiang) Wu 
Signed-off-by: Andrii Nakryiko 
---
 include/linux/objpool.h | 101 +++-
 lib/objpool.c   | 100 ---
 2 files changed, 99 insertions(+), 102 deletions(-)

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index 15aff4a17f0c..d8b1f7b91128 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -5,6 +5,10 @@
 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 
 /*
  * objpool: ring-array based lockless MPMC queue
@@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
 objpool_fini_cb release);
 
+/* try to retrieve object from slot */
+static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+   struct objpool_slot *slot = pool->cpu_slots[cpu];
+   /* load head snapshot, other cpus may change it */
+   uint32_t head = smp_load_acquire(&slot->head);
+
+   while (head != READ_ONCE(slot->last)) {
+   void *obj;
+
+   /*
+* data visibility of 'last' and 'head' could be out of
+* order since memory updating of 'last' and 'head' are
+* performed in push() and pop() independently
+*
+* before any retrieving attempts, pop() must guarantee
+* 'last' is behind 'head', that is to say, there must
+* be available objects in slot, which could be ensured
+* by condition 'last != head && last - head <= nr_objs'
+* that is equivalent to 'last - head - 1 < nr_objs' as
+* 'last' and 'head' are both unsigned int32
+*/
+   if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
+   head = READ_ONCE(slot->head);
+   continue;
+   }
+
+   /* obj must be retrieved before moving forward head */
+   obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+   /* move head forward to mark it's consumption */
+   if (try_cmpxchg_release(&slot->head, &head, head + 1))
+   return obj;
+   }
+
+   return NULL;
+}
+
 /**
  * objpool_pop() - allocate an object from objpool
  * @pool: object pool
  *
  * return value: object ptr or NULL if failed
  */
-void *objpool_pop(struct objpool_head *pool);
+static inline void *objpool_pop(struct objpool_head *pool)
+{
+   void *obj = NULL;
+   unsigned long flags;
+   int i, cpu;
+
+   /* disable local irq to avoid preemption & interruption */
+   raw_local_irq_save(flags);
+
+   cpu = raw_smp_processor_id();
+   for (i = 0; i < num_possible_cpus(); i++) {
+   obj = __objpool_try_get_slot(pool, cpu);
+   if (obj)
+   break;
+   cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+   }
+   raw_local_irq_restore(flags);
+
+   return obj;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static inline int
+__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+   struct objpool_slot *slot = pool->cpu_slots[cpu];
+   uint32_t head, tail;
+
+   /* loading tail and head as a local snapshot, tail first */
+   tail = READ_ONCE(slot->tail);
+
+   do {
+   head = READ_ONCE(slot->head);
+   /* fault caught: something must be wrong */
+   WARN_ON_ONCE(tail - head > pool->nr_objs);
+   } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+   /* now the tail position is reserved for the 

[PATCH 0/2] Objpool performance improvements

2024-04-24 Thread Andrii Nakryiko
Improve objpool (used heavily in kretprobe hot path) performance with two
improvements:
  - inlining performance critical objpool_push()/objpool_pop() operations;
  - avoiding re-calculating relatively expensive nr_possible_cpus().

These opportunities were found when benchmarking and profiling kprobes and
kretprobes with BPF-based benchmarks. See individual patches for details and
results.

Andrii Nakryiko (2):
  objpool: enable inlining objpool_push() and objpool_pop() operations
  objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids

 include/linux/objpool.h | 105 +++--
 lib/objpool.c   | 112 +++-
 2 files changed, 107 insertions(+), 110 deletions(-)

-- 
2.43.0