from:"Jason Xing"

[PATCH net-next v9 7/7] rstreason: make it work in trace world

2024-04-24 Thread Jason Xing

From: Jason Xing 

Finally, make the mechanism usable by exposing the reset reason in the
tcp_send_reset tracepoint.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 26 ++
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..49b5ee091cf6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN
+#define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+DEFINE_RST_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason) { SK_RST_REASON_##reason, #reason },
+#define FNe(reason){ SK_RST_REASON_##reason, #reason }
+
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6bd3a0fb9439..6096ac7a3a02 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e4f5c8b5172a..a8d5f22c079b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3640,7 +3640,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 317d7a6e6b01..77958adf2e16 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v9 6/7] mptcp: introducing a helper into active reset logic

2024-04-24 Thread Jason Xing

From: Jason Xing 

Since we have mapped every mptcp reset reason definition in enum
sk_rst_reason, introducing a new helper can cover some missing places
where we have already set the subflow->reset_reason.

Note: SK_RST_REASON_NOT_SPECIFIED and SK_RST_REASON_MPTCP_RST_EUNSPEC
both mean that the reason is unknown, so replacing the former with the
latter loses no information.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
Reviewed-by: Matthieu Baerts (NGI0) 
---
Link: 
https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include 
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 252618859ee8..cfc5f9c3f113 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mptcp_pm_gen.h"
 
@@ -608,6 +609,16 @@ sk_rst_convert_mptcp_reason(u32 reason)
}
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fb7abf2d01ca..97ec44d1df30 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include 
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, _sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(>sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3

[PATCH net-next v9 5/7] mptcp: support rstreason for passive reset

2024-04-24 Thread Jason Xing

From: Jason Xing 

It relies on the reset option stored in the skb, as described in RFC 8684.
Reusing this logic saves us a lot of effort. This patch replaces most of
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
Reviewed-by: Matthieu Baerts (NGI0) 
---
 net/mptcp/protocol.h | 27 +++
 net/mptcp/subflow.c  | 22 +-
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..252618859ee8 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -581,6 +581,33 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
*subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_mptcp_reason(u32 reason)
+{
+   switch (reason) {
+   case MPTCP_RST_EUNSPEC:
+   return SK_RST_REASON_MPTCP_RST_EUNSPEC;
+   case MPTCP_RST_EMPTCP:
+   return SK_RST_REASON_MPTCP_RST_EMPTCP;
+   case MPTCP_RST_ERESOURCE:
+   return SK_RST_REASON_MPTCP_RST_ERESOURCE;
+   case MPTCP_RST_EPROHIBIT:
+   return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
+   case MPTCP_RST_EWQ2BIG:
+   return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
+   case MPTCP_RST_EBADPERF:
+   return SK_RST_REASON_MPTCP_RST_EBADPERF;
+   case MPTCP_RST_EMIDDLEBOX:
+   return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
+   default:
+   /* It should not happen, or else errors may occur
+* in MPTCP layer
+*/
+   return SK_RST_REASON_ERROR;
+   }
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..fb7abf2d01ca 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = sk_rst_convert_mptcp_reason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v9 4/7] tcp: support rstreason for passive reset

2024-04-24 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of a TCP reset,
so we can finally display the corresponding item in enum sk_rst_reason
instead of inventing new reset reasons. This patch replaces all
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 15 +++
 net/ipv4/tcp_ipv4.c | 11 +++
 net/ipv6/tcp_ipv6.c | 11 +++
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index bc53b5a24505..df3b6ac0c9b3 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -103,4 +103,19 @@ enum sk_rst_reason {
 */
SK_RST_REASON_MAX,
 };
+
+/* Convert skb drop reasons to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_drop_reason(enum skb_drop_reason reason)
+{
+   switch (reason) {
+   case SKB_DROP_REASON_NOT_SPECIFIED:
+   return SK_RST_REASON_NOT_SPECIFIED;
+   case SKB_DROP_REASON_NO_SOCKET:
+   return SK_RST_REASON_NO_SOCKET;
+   default:
+   /* If we don't have our own corresponding reason */
+   return SK_RST_REASON_NOT_SPECIFIED;
+   }
+}
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..6bd3a0fb9439 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = 
sk_rst_convert_drop_reason(drop_reason);
+   tcp_v4_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2360,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, 
sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -2409,7 +2412,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, 
sk_rst_convert_drop_reason(drop_reason));
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..317d7a6e6b01 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = 
sk_rst_convert_drop_reason(drop_reason);
+   tcp_v6_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1945,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, 
sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -1998,7 +2001,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, 
sk_rst_convert_drop_reason(drop_reason));
inet_twsk_deschedule_put

[PATCH net-next v9 3/7] rstreason: prepare for active reset

2024-04-24 Thread Jason Xing

From: Jason Xing 

As was done for passive reset, only pass a possible reset reason along
each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 99a1d88f7f47..e4f5c8b5172a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3614,7 +3614,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)

[PATCH net-next v9 2/7] rstreason: prepare for passive reset

2024-04-24 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_R

[PATCH net-next v9 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-24 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) our own independent reasons which aren't relying on other reasons at all
3) reuse MP_TCPRST option for MPTCP

The benefits of a standalone reset reason are listed here:
1) it can cover more than one case, such as reset reasons in MPTCP,
active reset reasons.
2) people can easily and quickly understand and maintain this mechanism.
3) we get unified format of output with prefix stripped.
4) more new reset reasons are on the way
...

I will implement the basic code of active/passive reset reason in
those three protocols, which is not complete at the moment. For
passive reset part in TCP, I only introduce the NO_SOCKET common case
which could be set as an example.

After this series applied, it will have the ability to open a new
gate to let other people contribute more reasons into it :)

Signed-off-by: Jason Xing 
Acked-by: Matthieu Baerts (NGI0) 
---
 include/net/rstreason.h | 106 
 1 file changed, 106 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..bc53b5a24505
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(NOT_SPECIFIED)   \
+   FN(NO_SOCKET)   \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(ERROR)   \
+   FNe(MAX)
+
+/**
+ * enum sk_rst_reason - the reasons of socket reset
+ *
+ * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ *
+ * There are three parts in order:
+ * 1) skb drop reasons: relying on drop reasons for such as passive reset
+ * 2) independent reset reasons: such as active reset reasons
+ * 3) reset reasons in MPTCP: only for MPTCP use
+ */
+enum sk_rst_reason {
+   /* Refer to include/net/dropreason-core.h
+* Rely on skb drop reasons because it indicates exactly why RST
+* could happen.
+*/
+   /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */
+   SK_RST_REASON_NOT_SPECIFIED,
+   /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
+   SK_RST_REASON_NO_SOCKET,
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /**
+* @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /**
+* @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable

[PATCH net-next v9 0/7] Implement reset reason mechanism to detect

2024-04-24 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own, such as active reset reasons which cannot be
replaced by skb drop reasons or anything similar.

In this series, I focus mostly on the fundamental implementation of how
the rstreason mechanism works and give the detailed passive part as an
example, not including the active reset part. In future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v9
Link: 
https://lore.kernel.org/all/20240423072137.65168-1-kerneljasonx...@gmail.com/
1. address nit problem (Matt)
2. add acked-by and reviewed-by tags (Matt)

v8
Link: 
https://lore.kernel.org/all/20240422030109.12891-1-kerneljasonx...@gmail.com/
1. put sk reset reasons into more natural order (Matt)
2. adjust those helper position (Matt)
3. rename two convert function (Matt)
4. make the kdoc format correct (Simon)

v7
Link: 
https://lore.kernel.org/all/20240417085143.69578-1-kerneljasonx...@gmail.com/
1. get rid of enum casts which could bring potential issues (Eric)
2. use switch-case method to map between reset reason in MPTCP and sk reset
reason (Steven)
3. use switch-case method to map between skb drop reason and sk reset
reason

v6
1. add back casts, or else they are treated as error.

v5
Link: 
https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: 
https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: 
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree


Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |   4 +-
 include/net/rstreason.h| 121 +
 include/net/tcp.h  |   3 +-
 include/trace/events/tcp.h |  26 ++--
 net/dccp/ipv4.c|  10 +--
 net/dccp/ipv6.c|  10 +--
 net/dccp/minisocks.c   |   3 +-
 net/ipv4/tcp.c |  15 +++--
 net/ipv4/tcp_ipv4.c|  17 --
 net/ipv4/tcp_minisocks.c   |   3 +-
 net/ipv4/tcp_output.c  |   5 +-
 net/ipv4/tcp_timer.c   |   9 ++-
 net/ipv6/tcp_ipv6.c|  20 +++---
 net/mptcp/protocol.c   |   2 +-
 net/mptcp/protocol.h   |  38 
 net/mptcp/subflow.c|  27 ++---
 16 files changed, 266 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-23 Thread Jason Xing

On Tue, Apr 23, 2024 at 7:57 PM Simon Horman  wrote:
>
> On Tue, Apr 23, 2024 at 10:17:31AM +0800, Jason Xing wrote:
> > On Tue, Apr 23, 2024 at 10:14 AM Jason Xing  
> > wrote:
> > >
> > > Hi Simon,
> > >
> > > On Tue, Apr 23, 2024 at 2:28 AM Simon Horman  wrote:
> > > >
> > > > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote:
> > > >
> > > > ...
> > > >
> > > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h
> > > >
> > > > ...
> > > >
> > > > > +/**
> > > > > + * There are three parts in order:
> > > > > + * 1) reset reason in MPTCP: only for MPTCP use
> > > > > + * 2) skb drop reason: relying on drop reasons for such as passive 
> > > > > reset
> > > > > + * 3) independent reset reason: such as active reset reasons
> > > > > + */
> > > >
> > > > Hi Jason,
> > > >
> > > > A minor nit from my side.
> > > >
> > > > '/**' denotes the beginning of a Kernel doc,
> > > > but other than that, this comment is not a Kernel doc.
> > > >
> > > > FWIW, I would suggest providing a proper Kernel doc for enum 
> > > > sk_rst_reason.
> > > > But another option would be to simply make this a normal comment,
> > > > starting with "/* There are"
> > >
> > > Thanks Simon. I'm trying to use the kdoc way to make it right :)
> > >
> > > How about this one:
> > > /**
> > >  * enum sk_rst_reason - the reasons of socket reset
> > >  *
> > >  * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols.
> >
> > s/skb drop/sk reset/
> >
> > Sorry, I cannot withdraw my previous email in time.
> >
> > >  *
> > >  * There are three parts in order:
> > >  * 1) skb drop reasons: relying on drop reasons for such as passive
> > > reset
> > >  * 2) independent reset reasons: such as active reset reasons
> > >  * 3) reset reasons in MPTCP: only for MPTCP use
> > >  */
> > > ?
> > >
> > > I chose to mimic what enum skb_drop_reason does in the
> > > include/net/dropreason-core.h file.
> > >
> > > > +enum sk_rst_reason {
> > > > +   /**
> > > > +* Copy from include/uapi/linux/mptcp.h.
> > > > +* These reset fields will not be changed since they adhere to
> > > > +* RFC 8684. So do not touch them. I'm going to list each 
> > > > definition
> > > > +* of them respectively.
> > > > +*/
> > >
> > > Thanks to you, I found another similar point where I smell something
> > > wrong as in the above code. I'm going to replace '/**' with '/*' since
> > > it's only a comment, not a kdoc.
>
> Likewise, thanks Jason.
>
> I haven't had time to look at v8 properly,
> but I see that kernel-doc is happy with the changes
> you have made there as discussed above.
>

Thank you, Simon. I learned something new about the coding style.

Besides, some other nit problems have been spotted by Matt. I will fix
them if it's required to send a new version.

Re: [PATCH net-next v8 5/7] mptcp: support rstreason for passive reset

2024-04-23 Thread Jason Xing

Hello Matthieu,

On Tue, Apr 23, 2024 at 6:02 PM Matthieu Baerts  wrote:
>
> Hi Jason,
>
> On 23/04/2024 09:21, Jason Xing wrote:
> > From: Jason Xing 
> >
> > It relys on what reset options in the skb are as rfc8684 says. Reusing
>
> (if you have something else to fix, 'checkpatch.pl --codespell' reported
> a warning here: s/relys/relies/)

Thanks. Will fix it.

>
> > this logic can save us much energy. This patch replaces most of the prior
> > NOT_SPECIFIED reasons.
> >
> > Signed-off-by: Jason Xing 
> > ---
> >  net/mptcp/protocol.h | 28 
> >  net/mptcp/subflow.c  | 22 +-
> >  2 files changed, 45 insertions(+), 5 deletions(-)
> >
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index fdfa843e2d88..bbcb8c068aae 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -581,6 +581,34 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
> > *subflow)
> >   WRITE_ONCE(subflow->local_id, -1);
> >  }
> >
> > +/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
> > +static inline enum sk_rst_reason
> > +sk_rst_convert_mptcp_reason(u32 reason)
> > +{
> > + switch (reason) {
> > + case MPTCP_RST_EUNSPEC:
> > + return SK_RST_REASON_MPTCP_RST_EUNSPEC;
> > + case MPTCP_RST_EMPTCP:
> > + return SK_RST_REASON_MPTCP_RST_EMPTCP;
> > + case MPTCP_RST_ERESOURCE:
> > + return SK_RST_REASON_MPTCP_RST_ERESOURCE;
> > + case MPTCP_RST_EPROHIBIT:
> > + return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
> > + case MPTCP_RST_EWQ2BIG:
> > + return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
> > + case MPTCP_RST_EBADPERF:
> > + return SK_RST_REASON_MPTCP_RST_EBADPERF;
> > + case MPTCP_RST_EMIDDLEBOX:
> > + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
> > + default:
> > + /**
>
> I guess here as well, it should be '/*' instead of '/**'. But I guess
> that's fine, this file is probably not scanned. Anyway, if you have to
> send a new version, please fix this as well.

Thanks for your help. I will.

>
> (Also, this helper might require '#include ', but our
> CI is fine with it, it is also added in the next commit, and probably
> already included via include/net/request_sock.h. So I guess that's fine.)

Yes, if I need to submit the v9 patch, I will move it.

>
>
> Other than that, it looks good to me:
>
> Reviewed-by: Matthieu Baerts (NGI0) 

Thanks for all the reviews :)

Thanks,
Jason

>
> Cheers,
> Matt
> --
> Sponsored by the NGI0 Core fund.
>

[PATCH net-next v8 7/7] rstreason: make it work in trace world

2024-04-23 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 26 ++
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..49b5ee091cf6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN
+#define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+DEFINE_RST_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason) { SK_RST_REASON_##reason, #reason },
+#define FNe(reason){ SK_RST_REASON_##reason, #reason }
+
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6bd3a0fb9439..6096ac7a3a02 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 276d9d541b01..b08ffb17d5a0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3612,7 +3612,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 317d7a6e6b01..77958adf2e16 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v8 6/7] mptcp: introducing a helper into active reset logic

2024-04-23 Thread Jason Xing

From: Jason Xing 

Since we have mapped every mptcp reset reason definition in enum
sk_rst_reason, introducing a new helper can cover some missing places
where we have already set the subflow->reset_reason.

Note: using SK_RST_REASON_NOT_SPECIFIED is the same as
SK_RST_REASON_MPTCP_RST_EUNSPEC. They are both unknown. So we can convert
it directly.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
---
Link: 
https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include 
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index bbcb8c068aae..d40ad4a2f1b8 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mptcp_pm_gen.h"
 
@@ -609,6 +610,16 @@ sk_rst_convert_mptcp_reason(u32 reason)
}
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fb7abf2d01ca..97ec44d1df30 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include 
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, _sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(>sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3

[PATCH net-next v8 5/7] mptcp: support rstreason for passive reset

2024-04-23 Thread Jason Xing

From: Jason Xing 

It relies on what reset options in the skb are as rfc8684 says. Reusing
this logic can save us much energy. This patch replaces most of the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/protocol.h | 28 
 net/mptcp/subflow.c  | 22 +-
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..bbcb8c068aae 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -581,6 +581,34 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
*subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+/* Convert reset reasons in MPTCP to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_mptcp_reason(u32 reason)
+{
+   switch (reason) {
+   case MPTCP_RST_EUNSPEC:
+   return SK_RST_REASON_MPTCP_RST_EUNSPEC;
+   case MPTCP_RST_EMPTCP:
+   return SK_RST_REASON_MPTCP_RST_EMPTCP;
+   case MPTCP_RST_ERESOURCE:
+   return SK_RST_REASON_MPTCP_RST_ERESOURCE;
+   case MPTCP_RST_EPROHIBIT:
+   return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
+   case MPTCP_RST_EWQ2BIG:
+   return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
+   case MPTCP_RST_EBADPERF:
+   return SK_RST_REASON_MPTCP_RST_EBADPERF;
+   case MPTCP_RST_EMIDDLEBOX:
+   return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
+   default:
+   /**
+* It should not happen, or else errors may occur
+* in MPTCP layer
+*/
+   return SK_RST_REASON_ERROR;
+   }
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..fb7abf2d01ca 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = sk_rst_convert_mptcp_reason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = sk_rst_convert_mptcp_reason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v8 4/7] tcp: support rstreason for passive reset

2024-04-23 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we can finally display the corresponding item in enum sk_rst_reason
instead of reinventing new reset reasons. This patch replaces all
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 15 +++
 net/ipv4/tcp_ipv4.c | 11 +++
 net/ipv6/tcp_ipv6.c | 11 +++
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index bc53b5a24505..df3b6ac0c9b3 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -103,4 +103,19 @@ enum sk_rst_reason {
 */
SK_RST_REASON_MAX,
 };
+
+/* Convert skb drop reasons to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_drop_reason(enum skb_drop_reason reason)
+{
+   switch (reason) {
+   case SKB_DROP_REASON_NOT_SPECIFIED:
+   return SK_RST_REASON_NOT_SPECIFIED;
+   case SKB_DROP_REASON_NO_SOCKET:
+   return SK_RST_REASON_NO_SOCKET;
+   default:
+   /* If we don't have our own corresponding reason */
+   return SK_RST_REASON_NOT_SPECIFIED;
+   }
+}
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..6bd3a0fb9439 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = 
sk_rst_convert_drop_reason(drop_reason);
+   tcp_v4_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2360,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, 
sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -2409,7 +2412,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, 
sk_rst_convert_drop_reason(drop_reason));
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..317d7a6e6b01 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   enum sk_rst_reason rst_reason;
+
+   rst_reason = 
sk_rst_convert_drop_reason(drop_reason);
+   tcp_v6_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1945,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, 
sk_rst_convert_drop_reason(drop_reason));
}
 
 discard_it:
@@ -1998,7 +2001,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, 
sk_rst_convert_drop_reason(drop_reason));
inet_twsk_deschedule_put

[PATCH net-next v8 3/7] rstreason: prepare for active reset

2024-04-23 Thread Jason Xing

From: Jason Xing 

Like what we did to passive reset:
only passing possible reset reason in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61119d42b0fd..276d9d541b01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3586,7 +3586,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)

[PATCH net-next v8 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-23 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) our own independent reasons which aren't relying on other reasons at all
3) reuse MP_TCPRST option for MPTCP

The benefits of a standalone reset reason are listed here:
1) it can cover more than one case, such as reset reasons in MPTCP,
active reset reasons.
2) people can easily and quickly understand and maintain this mechanism.
3) we get unified format of output with prefix stripped.
4) more new reset reasons are on the way
...

I will implement the basic codes of active/passive reset reason in
those three protocols, which are not complete for this moment. For
passive reset part in TCP, I only introduce the NO_SOCKET common case
which could be set as an example.

After this series is applied, it will have the ability to open a new
gate to let other people contribute more reasons into it :)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 106 
 1 file changed, 106 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..bc53b5a24505
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(NOT_SPECIFIED)   \
+   FN(NO_SOCKET)   \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(ERROR)   \
+   FNe(MAX)
+
+/**
+ * enum sk_rst_reason - the reasons of socket reset
+ *
+ * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ *
+ * There are three parts in order:
+ * 1) skb drop reasons: relying on drop reasons for such as passive reset
+ * 2) independent reset reasons: such as active reset reasons
+ * 3) reset reasons in MPTCP: only for MPTCP use
+ */
+enum sk_rst_reason {
+   /* Refer to include/net/dropreason-core.h
+* Rely on skb drop reasons because it indicates exactly why RST
+* could happen.
+*/
+   /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */
+   SK_RST_REASON_NOT_SPECIFIED,
+   /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
+   SK_RST_REASON_NO_SOCKET,
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /**
+* @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /**
+* @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance.
+* This code

[PATCH net-next v8 2/7] rstreason: prepare for passive reset

2024-04-23 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}

[PATCH net-next v8 0/7] Implement reset reason mechanism to detect

2024-04-23 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own, such as active reset reasons which can not be
replaced by skb drop reason or something like this.

In this series, I focus on the fundamental implementation, mostly about how
the rstreason mechanism works, and give the detailed passive part as an
example, not including the active reset part. In future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v8
Link: 
https://lore.kernel.org/all/20240422030109.12891-1-kerneljasonx...@gmail.com/
1. put sk reset reasons into more natural order (Matt)
2. adjust those helper position (Matt)
3. rename two convert function (Matt)
4. make the kdoc format correct (Simon)

v7
Link: 
https://lore.kernel.org/all/20240417085143.69578-1-kerneljasonx...@gmail.com/
1. get rid of enum casts which could bring potential issues (Eric)
2. use switch-case method to map between reset reason in MPTCP and sk reset
reason (Steven)
3. use switch-case method to map between skb drop reason and sk reset
reason

v6
1. add back casts, or else they are treated as error.

v5
Link: 
https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: 
https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: 
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree

Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |   4 +-
 include/net/rstreason.h| 121 +
 include/net/tcp.h  |   3 +-
 include/trace/events/tcp.h |  26 ++--
 net/dccp/ipv4.c|  10 +--
 net/dccp/ipv6.c|  10 +--
 net/dccp/minisocks.c   |   3 +-
 net/ipv4/tcp.c |  15 +++--
 net/ipv4/tcp_ipv4.c|  17 --
 net/ipv4/tcp_minisocks.c   |   3 +-
 net/ipv4/tcp_output.c  |   5 +-
 net/ipv4/tcp_timer.c   |   9 ++-
 net/ipv6/tcp_ipv6.c|  20 +++---
 net/mptcp/protocol.c   |   2 +-
 net/mptcp/protocol.h   |  39 
 net/mptcp/subflow.c|  27 ++---
 16 files changed, 267 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-22 Thread Jason Xing

On Tue, Apr 23, 2024 at 10:14 AM Jason Xing  wrote:
>
> Hi Simon,
>
> On Tue, Apr 23, 2024 at 2:28 AM Simon Horman  wrote:
> >
> > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote:
> >
> > ...
> >
> > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h
> >
> > ...
> >
> > > +/**
> > > + * There are three parts in order:
> > > + * 1) reset reason in MPTCP: only for MPTCP use
> > > + * 2) skb drop reason: relying on drop reasons for such as passive reset
> > > + * 3) independent reset reason: such as active reset reasons
> > > + */
> >
> > Hi Jason,
> >
> > A minor nit from my side.
> >
> > '/**' denotes the beginning of a Kernel doc,
> > but other than that, this comment is not a Kernel doc.
> >
> > FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason.
> > But another option would be to simply make this a normal comment,
> > starting with "/* There are"
>
> Thanks Simon. I'm trying to use the kdoc way to make it right :)
>
> How about this one:
> /**
>  * enum sk_rst_reason - the reasons of socket reset
>  *
>  * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols.

s/skb drop/sk reset/

Sorry, I cannot withdraw my previous email in time.

>  *
>  * There are three parts in order:
>  * 1) skb drop reasons: relying on drop reasons for such as passive
> reset
>  * 2) independent reset reasons: such as active reset reasons
>  * 3) reset reasons in MPTCP: only for MPTCP use
>  */
> ?
>
> I chose to mimic what enum skb_drop_reason does in the
> include/net/dropreason-core.h file.
>
> > +enum sk_rst_reason {
> > +   /**
> > +* Copy from include/uapi/linux/mptcp.h.
> > +* These reset fields will not be changed since they adhere to
> > +* RFC 8684. So do not touch them. I'm going to list each definition
> > +* of them respectively.
> > +*/
>
> Thanks to you, I found another similar point where I smell something
> wrong as in the above code. I'm going to replace '/**' with '/*' since
> it's only a comment, not a kdoc.
>
> Thanks,
> Jason

Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-22 Thread Jason Xing

Hi Simon,

On Tue, Apr 23, 2024 at 2:28 AM Simon Horman  wrote:
>
> On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote:
>
> ...
>
> > diff --git a/include/net/rstreason.h b/include/net/rstreason.h
>
> ...
>
> > +/**
> > + * There are three parts in order:
> > + * 1) reset reason in MPTCP: only for MPTCP use
> > + * 2) skb drop reason: relying on drop reasons for such as passive reset
> > + * 3) independent reset reason: such as active reset reasons
> > + */
>
> Hi Jason,
>
> A minor nit from my side.
>
> '/**' denotes the beginning of a Kernel doc,
> but other than that, this comment is not a Kernel doc.
>
> FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason.
> But another option would be to simply make this a normal comment,
> starting with "/* There are"

Thanks Simon. I'm trying to use the kdoc way to make it right :)

How about this one:
/**
 * enum sk_rst_reason - the reasons of socket reset
 *
 * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols.
 *
 * There are three parts in order:
 * 1) skb drop reasons: relying on drop reasons for such as passive
reset
 * 2) independent reset reasons: such as active reset reasons
 * 3) reset reasons in MPTCP: only for MPTCP use
 */
?

I chose to mimic what enum skb_drop_reason does in the
include/net/dropreason-core.h file.

> +enum sk_rst_reason {
> +   /**
> +* Copy from include/uapi/linux/mptcp.h.
> +* These reset fields will not be changed since they adhere to
> +* RFC 8684. So do not touch them. I'm going to list each definition
> +* of them respectively.
> +*/

Thanks to you, I found another similar point where I smell something
wrong as in the above code. I'm going to replace '/**' with '/*' since
it's only a comment, not a kdoc.

Thanks,
Jason

Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-22 Thread Jason Xing

Hello Matthieu,

On Mon, Apr 22, 2024 at 4:47 PM Matthieu Baerts  wrote:
>
> Hi Jason,
>
> On 22/04/2024 05:01, Jason Xing wrote:
> > From: Jason Xing 
> >
> > Add a new standalone file for the easy future extension to support
> > both active reset and passive reset in the TCP/DCCP/MPTCP protocols.
>
> Thank you for looking at that!

Thanks for the review!

>
> (...)
>
> > diff --git a/include/net/rstreason.h b/include/net/rstreason.h
> > new file mode 100644
> > index ..c57bc5413c17
> > --- /dev/null
> > +++ b/include/net/rstreason.h
> > @@ -0,0 +1,144 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later */
> > +
> > +#ifndef _LINUX_RSTREASON_H
> > +#define _LINUX_RSTREASON_H
> > +#include 
> > +#include 
> > +
> > +#define DEFINE_RST_REASON(FN, FNe)   \
> > + FN(MPTCP_RST_EUNSPEC)   \
> > + FN(MPTCP_RST_EMPTCP)\
> > + FN(MPTCP_RST_ERESOURCE) \
> > + FN(MPTCP_RST_EPROHIBIT) \
> > + FN(MPTCP_RST_EWQ2BIG)   \
> > + FN(MPTCP_RST_EBADPERF)  \
> > + FN(MPTCP_RST_EMIDDLEBOX)\
>
> Small detail: should it not make more sense to put the ones linked to
> MPTCP at the end? I mean I guess MPTCP should be treated in second
> priority: CONFIG_MPTCP could not be set, and the ones linked to TCP
> should be more frequent, etc.

Do you mean that I need to adjust the order: 1) tcp reasons first, 2)
independent reasons, 3) mptcp reasons ?

Reasonable. I will do it :)

>
> > + FN(NOT_SPECIFIED)   \
> > + FN(NO_SOCKET)   \
> > + FNe(MAX)
>
> (...)
>
> > +/* Convert reset reasons in MPTCP to our own enum type */
> > +static inline enum sk_rst_reason convert_mptcpreason(u32 reason)
> > +{
> > + switch (reason) {
> > + case MPTCP_RST_EUNSPEC:
> > + return SK_RST_REASON_MPTCP_RST_EUNSPEC;
> > + case MPTCP_RST_EMPTCP:
> > + return SK_RST_REASON_MPTCP_RST_EMPTCP;
> > + case MPTCP_RST_ERESOURCE:
> > + return SK_RST_REASON_MPTCP_RST_ERESOURCE;
> > + case MPTCP_RST_EPROHIBIT:
> > + return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
> > + case MPTCP_RST_EWQ2BIG:
> > + return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
> > + case MPTCP_RST_EBADPERF:
> > + return SK_RST_REASON_MPTCP_RST_EBADPERF;
> > + case MPTCP_RST_EMIDDLEBOX:
> > + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
> > + default:
> > + /**
> > +  * It should not happen, or else errors may occur
> > +  * in MPTCP layer
> > +  */
> > + return SK_RST_REASON_ERROR;
> > + }
> > +}
>
> If this helper is only used on MPTCP, maybe better to move it to
> net/mptcp/protocol.h (and to patch 5/7?)? We tried to isolate MPTCP code.

Roger that. I will move the helper into protocol.h as well as the patch itself.

>
> Also, maybe it is just me, but I'm not a big fan of the helper name:
> convert_mptcpreason() (same for the "drop" one). I think it should at
> least mention its "origin" (rst reason): e.g. something like
> (sk_)rst_reason_convert_mptcp or (sk_)rst_convert_mptcp_reason() (or
> mptcp_to_rst_reason())?
>
> And (sk_)rst_reason_convert_(skb_)drop() (or skb_drop_to_rst_reason())?

I agree with you. Actually I had a local patch where I used
sk_rst_reason_skbdrop() and sk_rst_reason_mptcpreason().
Interestingly, I changed them in this patch series due to the function
name being too long (which is my initial thought).

I will use sk_rst_convert_xxx_reason() as you suggested.

>
> > +/* Convert reset reasons in MPTCP to our own enum type */
>
> I don't think this part is linked to MPTCP, right?

Ah, copy-paste syndrome... Sorry, I will correct it.

>
> > +static inline enum sk_rst_reason convert_dropreason(enum skb_drop_reason 
> > reason)
> > +{
> > + switch (reason) {
> > + case SKB_DROP_REASON_NOT_SPECIFIED:
> > + return SK_RST_REASON_NOT_SPECIFIED;
> > + case SKB_DROP_REASON_NO_SOCKET:
> > + return SK_RST_REASON_NO_SOCKET;
> > + default:
> > + /* If we don't have our own corresponding reason */
> > + return SK_RST_REASON_NOT_SPECIFIED;
> > + }
> > +}
>
> (This helper could be introduced in patch 4/7 because it is not used
> before, but I'm fine either ways.)

Good. It makes more sense.

Thanks,
Jason

[PATCH net-next v7 7/7] rstreason: make it work in trace world

2024-04-21 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 26 ++
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..49b5ee091cf6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN
+#define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+DEFINE_RST_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason) { SK_RST_REASON_##reason, #reason },
+#define FNe(reason){ SK_RST_REASON_##reason, #reason }
+
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 06f8a24801b2..5db2d55b65af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 276d9d541b01..b08ffb17d5a0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3612,7 +3612,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d8c74e90698b..966a6a9b0f44 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v7 6/7] mptcp: introducing a helper into active reset logic

2024-04-21 Thread Jason Xing

From: Jason Xing 

Since we have mapped every mptcp reset reason definition in enum
sk_rst_reason, introducing a new helper can cover some missing places
where we have already set the subflow->reset_reason.

Note: using SK_RST_REASON_NOT_SPECIFIED is the same as
SK_RST_REASON_MPTCP_RST_EUNSPEC. They are both unknown. So we can convert
it directly.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
---
Link: 
https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include 
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..d4f83f1c6880 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mptcp_pm_gen.h"
 
@@ -581,6 +582,16 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
*subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcpreason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 54e4b2515517..423c842086ff 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include 
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, _sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(>sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3

[PATCH net-next v7 5/7] mptcp: support rstreason for passive reset

2024-04-21 Thread Jason Xing

From: Jason Xing 

It relies on what the reset options in the skb are, as RFC 8684 says. Reusing
this logic can save us much energy. This patch replaces most of the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..54e4b2515517 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcpreason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcpreason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = convert_mptcpreason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v7 4/7] tcp: support rstreason for passive reset

2024-04-21 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we can finally display the corresponding item in enum sk_rst_reason
instead of reinventing new reset reasons. This patch replaces all
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 9 +
 net/ipv6/tcp_ipv6.c | 9 +
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..06f8a24801b2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, convert_dropreason(reason));
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb,
+ 
convert_dropreason(drop_reason));
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2358,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, convert_dropreason(drop_reason));
}
 
 discard_it:
@@ -2409,7 +2410,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, convert_dropreason(drop_reason));
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..d8c74e90698b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, convert_dropreason(reason));
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb,
+ 
convert_dropreason(drop_reason));
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1943,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, convert_dropreason(drop_reason));
}
 
 discard_it:
@@ -1998,7 +1999,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, convert_dropreason(drop_reason));
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next v7 3/7] rstreason: prepare for active reset

2024-04-21 Thread Jason Xing

From: Jason Xing 

Like what we did for passive reset:
only pass the possible reset reason in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61119d42b0fd..276d9d541b01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3586,7 +3586,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)

[PATCH net-next v7 2/7] rstreason: prepare for passive reset

2024-04-21 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}

[PATCH net-next v7 0/7] Implement reset reason mechanism to detect

2024-04-21 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own, such as active reset reasons which cannot be
replaced by skb drop reasons or something like this.

In this series, I focus on the fundamental implementation, mostly about how
the rstreason mechanism works, and give the detailed passive part as an
example, not including the active reset part. In future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v7
Link: 
https://lore.kernel.org/all/20240417085143.69578-1-kerneljasonx...@gmail.com/
1. get rid of enum casts which could bring potential issues (Eric)
2. use switch-case method to map between reset reason in MPTCP and sk reset
reason (Steven)
3. use switch-case method to map between skb drop reason and sk reset
reason

v6
1. add back casts, or else they are treated as error.

v5
Link: 
https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: 
https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: 
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree


Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |   4 +-
 include/net/rstreason.h| 144 +
 include/net/tcp.h  |   3 +-
 include/trace/events/tcp.h |  26 +--
 net/dccp/ipv4.c|  10 +--
 net/dccp/ipv6.c|  10 +--
 net/dccp/minisocks.c   |   3 +-
 net/ipv4/tcp.c |  15 ++--
 net/ipv4/tcp_ipv4.c|  15 ++--
 net/ipv4/tcp_minisocks.c   |   3 +-
 net/ipv4/tcp_output.c  |   5 +-
 net/ipv4/tcp_timer.c   |   9 ++-
 net/ipv6/tcp_ipv6.c|  18 +++--
 net/mptcp/protocol.c   |   2 +-
 net/mptcp/protocol.h   |  11 +++
 net/mptcp/subflow.c|  27 +--
 16 files changed, 258 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

[PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-21 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse MP_TCPRST option for MPTCP
2) reuse drop reasons for passive reset in TCP
3) our own reasons which are not relying on other reasons at all

The benefits of a standalone reset reason are listed here:
1) it can cover more than one case, such as reset reasons in MPTCP,
active reset reasons.
2) people can easily/quickly understand and maintain this mechanism.
3) we get unified format of output with prefix stripped.
4) more new reset reasons are on the way
...

I will implement the basic codes of active/passive reset reason in
those three protocols, which are not complete for this moment. For
passive reset part in TCP, I only introduce the NO_SOCKET common case
which could be set as an example.

After this series is applied, it will have the ability to open a new
gate to let other people contribute more reasons into it :)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 144 
 1 file changed, 144 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..c57bc5413c17
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FN(NO_SOCKET)   \
+   FNe(MAX)
+
+/**
+ * There are three parts in order:
+ * 1) reset reason in MPTCP: only for MPTCP use
+ * 2) skb drop reason: relying on drop reasons for such as passive reset
+ * 3) independent reset reason: such as active reset reasons
+ */
+enum sk_rst_reason {
+   /**
+* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /**
+* @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /**
+* @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /**
+* @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /**
+* Refer

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-19 Thread Jason Xing

Hello Steven,

On Sat, Apr 20, 2024 at 10:36 AM Steven Rostedt  wrote:
>
> On Fri, 19 Apr 2024 16:00:20 +0800
> Jason Xing  wrote:
>
> > If other experts see this thread, please help me. I would appreciate
> > it. I have strong interests and feel strong responsibility to
> > implement something like this patch series. It can be very useful!!
>
> I'm not a networking expert, but as I'm Cc'd and this is about tracing,
> I'll jump in to see if I can help. Honestly, reading the thread, it
> appears that you and Eric are talking past each other.
>
> I believe Eric is concerned about losing the value of the enum. Enums
> are types, and if you typecast them to another type, they lose the
> previous type, and all the safety that goes with it.

Ah, I see. Possible lost value in another enum could cause a problem.

>
> Now, I do not really understand the problem trying to be solved here. I
> understand how TCP works but I never looked into the implementation of
> MPTCP.
>
> You added this:
>
> +static inline enum sk_rst_reason convert_mptcp_reason(u32 reason)
> +{
> +   return reason += RST_REASON_START;
> +}
>
> And used it for places like this:
>
> @@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const 
> struct sock *sk,
> return dst;
>
> dst_release(dst);
> -   if (!req->syncookie)
> -   tcp_request_sock_ops.send_reset(sk, skb, 
> SK_RST_REASON_NOT_SPECIFIED);
> +   if (!req->syncookie) {
> +   struct mptcp_ext *mpext = mptcp_get_ext(skb);
> +   enum sk_rst_reason reason;
> +
> +   reason = convert_mptcp_reason(mpext->reset_reason);
> +   tcp_request_sock_ops.send_reset(sk, skb, reason);
> +   }
> return NULL;
>  }
>
> As I don't know this code or how MPTCP works, I do not understand the
> above. It used to pass SK_RST_REASON_NOT_SPECIFIED to send_reset(). But
> now it takes a "reset_reason" and calls "convert_mptcp_reason()" to get
> back an enum value.
>
> If you are mapping the reset_reason to enum sk_rst_reason, why not do
> it via a real conversion instead of this fragile arithmetic between the two
> values?
>
> static inline enum sk_rst_reason convert_mptcp_reason(u32 reason)
> {
> switch(reason) {
> case 0: return SK_RST_REASON_MPTCP_RST_EUNSPEC;
> case 1: return SK_RST_REASON_MPTCP_RST_EMPTCP;
> case 2: return SK_RST_REASON_MPTCP_RST_ERESOURCE;
> [..]
> default: return SK_RST_REASON_MAX; // or some other error value
> }
> }

This code snippet looks much better than mine.

>
> I'm not sure if this is any better, but it's not doing any casting and
> it's easier to understand. It's a simple mapping between the reason and
> the enum and there's no inherent dependency between the values. Could
> possibly create enums for the reason numbers and replace the hard coded
> values with them.

Right.

I also need to handle many drop reasons cases like what you do. Due to
too many of them, I will try the key-value map instead of switch-case
and then see if it works.

>
> That way that helper function is at least doing a real conversion of
> one type to another.
>
> But like I said from the beginning. I don't understand the details here
> and have not spent the time to dig deeper. I just read the thread and I
> agree with Eric that the arithmetic conversion of reason to an enum
> looks fragile at best and buggy at worst.

Thanks so much for your help, which I didn't even imagine.

Sure, after one night of investigation, I figured it out. I will drop
enum casts without any doubt as Eric and you suggested. But I believe
a new enum is needed, grouping various reasons together which help
ftrace print the valid string to userspace.

Thanks,
Jason

>
> -- Steve

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-19 Thread Jason Xing

On Fri, Apr 19, 2024 at 4:00 PM Jason Xing  wrote:
>
> On Fri, Apr 19, 2024 at 3:44 PM Eric Dumazet  wrote:
> >
> > On Fri, Apr 19, 2024 at 9:29 AM Jason Xing  
> > wrote:
> > >
> > > On Fri, Apr 19, 2024 at 3:02 PM Eric Dumazet  wrote:
> > > >
> > > > On Fri, Apr 19, 2024 at 4:31 AM Jason Xing  
> > > > wrote:
> > > > >
> > > > > On Fri, Apr 19, 2024 at 7:26 AM Jason Xing 
> > > > >  wrote:
> > > > > >
> > > > > > > When I said "If you feel the need to put them in a special group, 
> > > > > > > this
> > > > > > > is fine by me.",
> > > > > > > this was really about partitioning the existing enum into groups, 
> > > > > > > if
> > > > > > > you prefer having a group of 'RES reasons'
> > > > > >
> > > > > > Are you suggesting copying what we need from enum skb_drop_reason{} 
> > > > > > to
> > > > > > enum sk_rst_reason{}? Why not reusing them directly. I have no idea
> > > > > > what the side effect of cast conversion itself is?
> > > > >
> > > > > Sorry that I'm writing this email. I'm worried my statement is not
> > > > > that clear, so I write one simple snippet which can help me explain
> > > > > well :)
> > > > >
> > > > > Allow me give NO_SOCKET as an example:
> > > > > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> > > > > index e63a3bf99617..2c9f7364de45 100644
> > > > > --- a/net/ipv4/icmp.c
> > > > > +++ b/net/ipv4/icmp.c
> > > > > @@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
> > > > > int code, __be32 info,
> > > > > if (!fl4.saddr)
> > > > > fl4.saddr = htonl(INADDR_DUMMY);
> > > > >
> > > > > +   trace_icmp_send(skb_in, type, code);
> > > > > icmp_push_reply(sk, _param, , , );
> > > > >  ende:
> > > > > ip_rt_put(rt);
> > > > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > > > index 1e650ec71d2f..d5963831280f 100644
> > > > > --- a/net/ipv4/tcp_ipv4.c
> > > > > +++ b/net/ipv4/tcp_ipv4.c
> > > > > @@ -2160,6 +2160,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
> > > > >  {
> > > > > struct net *net = dev_net(skb->dev);
> > > > > enum skb_drop_reason drop_reason;
> > > > > +   enum sk_rst_reason rst_reason;
> > > > > int sdif = inet_sdif(skb);
> > > > > int dif = inet_iif(skb);
> > > > > const struct iphdr *iph;
> > > > > @@ -2355,7 +2356,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
> > > > >  bad_packet:
> > > > > __TCP_INC_STATS(net, TCP_MIB_INERRS);
> > > > > } else {
> > > > > -   tcp_v4_send_reset(NULL, skb);
> > > > > +   rst_reason = RST_REASON_NO_SOCKET;
> > > > > +   tcp_v4_send_reset(NULL, skb, rst_reason);
> > > > > }
> > > > >
> > > > >  discard_it:
> > > > >
> > > > > As you can see, we need to add a new 'rst_reason' variable which
> > > > > actually is the same as drop reason. They are the same except for the
> > > > > enum type... Such rst_reasons/drop_reasons are all over the place.
> > > > >
> > > > > Eric, if you have a strong preference, I can do it as you said.
> > > > >
> > > > > Well, how about explicitly casting them like this based on the current
> > > > > series. It looks better and clearer and more helpful to people who is
> > > > > reading codes to understand:
> > > > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > > > index 461b4d2b7cfe..eb125163d819 100644
> > > > > --- a/net/ipv4/tcp_ipv4.c
> > > > > +++ b/net/ipv4/tcp_ipv4.c
> > > > > @@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct 
> > > > > sk_buff *skb)
> > > > > return 0;
> > > > >
> > > > >  reset:
> > > > > -   tcp_v4_send_reset(rsk, skb, (u32)reason);
> > > > > +   tcp_v4_send_reset(rsk, sk

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-19 Thread Jason Xing

On Fri, Apr 19, 2024 at 3:44 PM Eric Dumazet  wrote:
>
> On Fri, Apr 19, 2024 at 9:29 AM Jason Xing  wrote:
> >
> > On Fri, Apr 19, 2024 at 3:02 PM Eric Dumazet  wrote:
> > >
> > > On Fri, Apr 19, 2024 at 4:31 AM Jason Xing  
> > > wrote:
> > > >
> > > > On Fri, Apr 19, 2024 at 7:26 AM Jason Xing  
> > > > wrote:
> > > > >
> > > > > > When I said "If you feel the need to put them in a special group, 
> > > > > > this
> > > > > > is fine by me.",
> > > > > > this was really about partitioning the existing enum into groups, if
> > > > > > you prefer having a group of 'RES reasons'
> > > > >
> > > > > Are you suggesting copying what we need from enum skb_drop_reason{} to
> > > > > enum sk_rst_reason{}? Why not reusing them directly. I have no idea
> > > > > what the side effect of cast conversion itself is?
> > > >
> > > > Sorry that I'm writing this email. I'm worried my statement is not
> > > > that clear, so I write one simple snippet which can help me explain
> > > > well :)
> > > >
> > > > Allow me give NO_SOCKET as an example:
> > > > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> > > > index e63a3bf99617..2c9f7364de45 100644
> > > > --- a/net/ipv4/icmp.c
> > > > +++ b/net/ipv4/icmp.c
> > > > @@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
> > > > int code, __be32 info,
> > > > if (!fl4.saddr)
> > > > fl4.saddr = htonl(INADDR_DUMMY);
> > > >
> > > > +   trace_icmp_send(skb_in, type, code);
> > > > icmp_push_reply(sk, _param, , , );
> > > >  ende:
> > > > ip_rt_put(rt);
> > > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > > index 1e650ec71d2f..d5963831280f 100644
> > > > --- a/net/ipv4/tcp_ipv4.c
> > > > +++ b/net/ipv4/tcp_ipv4.c
> > > > @@ -2160,6 +2160,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
> > > >  {
> > > > struct net *net = dev_net(skb->dev);
> > > > enum skb_drop_reason drop_reason;
> > > > +   enum sk_rst_reason rst_reason;
> > > > int sdif = inet_sdif(skb);
> > > > int dif = inet_iif(skb);
> > > > const struct iphdr *iph;
> > > > @@ -2355,7 +2356,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
> > > >  bad_packet:
> > > > __TCP_INC_STATS(net, TCP_MIB_INERRS);
> > > > } else {
> > > > -   tcp_v4_send_reset(NULL, skb);
> > > > +   rst_reason = RST_REASON_NO_SOCKET;
> > > > +   tcp_v4_send_reset(NULL, skb, rst_reason);
> > > > }
> > > >
> > > >  discard_it:
> > > >
> > > > As you can see, we need to add a new 'rst_reason' variable which
> > > > actually is the same as drop reason. They are the same except for the
> > > > enum type... Such rst_reasons/drop_reasons are all over the place.
> > > >
> > > > Eric, if you have a strong preference, I can do it as you said.
> > > >
> > > > Well, how about explicitly casting them like this based on the current
> > > > series. It looks better and clearer and more helpful to people who is
> > > > reading codes to understand:
> > > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > > index 461b4d2b7cfe..eb125163d819 100644
> > > > --- a/net/ipv4/tcp_ipv4.c
> > > > +++ b/net/ipv4/tcp_ipv4.c
> > > > @@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff 
> > > > *skb)
> > > > return 0;
> > > >
> > > >  reset:
> > > > -   tcp_v4_send_reset(rsk, skb, (u32)reason);
> > > > +   tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
> > > >  discard:
> > > > kfree_skb_reason(skb, reason);
> > > > /* Be careful here. If this function gets more complicated and
> > >
> > > It makes no sense to declare an enum sk_rst_reason and then convert it
> > > to drop_reason
> > > or vice versa.
> > >
> > > Next thing you know, compiler guys will add a new -Woption that will
> > > forbid such conversions.
> > >
> > > Please add

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-19 Thread Jason Xing

On Fri, Apr 19, 2024 at 3:02 PM Eric Dumazet  wrote:
>
> On Fri, Apr 19, 2024 at 4:31 AM Jason Xing  wrote:
> >
> > On Fri, Apr 19, 2024 at 7:26 AM Jason Xing  
> > wrote:
> > >
> > > > When I said "If you feel the need to put them in a special group, this
> > > > is fine by me.",
> > > > this was really about partitioning the existing enum into groups, if
> > > > you prefer having a group of 'RES reasons'
> > >
> > > Are you suggesting copying what we need from enum skb_drop_reason{} to
> > > enum sk_rst_reason{}? Why not reusing them directly. I have no idea
> > > what the side effect of cast conversion itself is?
> >
> > Sorry that I'm writing this email. I'm worried my statement is not
> > that clear, so I write one simple snippet which can help me explain
> > well :)
> >
> > Allow me give NO_SOCKET as an example:
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> > index e63a3bf99617..2c9f7364de45 100644
> > --- a/net/ipv4/icmp.c
> > +++ b/net/ipv4/icmp.c
> > @@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
> > int code, __be32 info,
> > if (!fl4.saddr)
> > fl4.saddr = htonl(INADDR_DUMMY);
> >
> > +   trace_icmp_send(skb_in, type, code);
> > icmp_push_reply(sk, _param, , , );
> >  ende:
> > ip_rt_put(rt);
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index 1e650ec71d2f..d5963831280f 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -2160,6 +2160,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
> >  {
> > struct net *net = dev_net(skb->dev);
> > enum skb_drop_reason drop_reason;
> > +   enum sk_rst_reason rst_reason;
> > int sdif = inet_sdif(skb);
> > int dif = inet_iif(skb);
> > const struct iphdr *iph;
> > @@ -2355,7 +2356,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
> >  bad_packet:
> > __TCP_INC_STATS(net, TCP_MIB_INERRS);
> > } else {
> > -   tcp_v4_send_reset(NULL, skb);
> > +   rst_reason = RST_REASON_NO_SOCKET;
> > +   tcp_v4_send_reset(NULL, skb, rst_reason);
> > }
> >
> >  discard_it:
> >
> > As you can see, we need to add a new 'rst_reason' variable which
> > actually is the same as drop reason. They are the same except for the
> > enum type... Such rst_reasons/drop_reasons are all over the place.
> >
> > Eric, if you have a strong preference, I can do it as you said.
> >
> > Well, how about explicitly casting them like this based on the current
> > series. It looks better and clearer and more helpful to people who is
> > reading codes to understand:
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index 461b4d2b7cfe..eb125163d819 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff 
> > *skb)
> > return 0;
> >
> >  reset:
> > -   tcp_v4_send_reset(rsk, skb, (u32)reason);
> > +   tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
> >  discard:
> > kfree_skb_reason(skb, reason);
> > /* Be careful here. If this function gets more complicated and
>
> It makes no sense to declare an enum sk_rst_reason and then convert it
> to drop_reason
> or vice versa.
>
> Next thing you know, compiler guys will add a new -Woption that will
> forbid such conversions.
>
> Please add to tcp_v4_send_reset() an skb_drop_reason, not a new enum.

Ah... It looks like I didn't make myself clear again. Sorry...
Actually I wrote this part many times. My conclusion is that it's not
feasible to do this.

REASONS:
If we __only__ need to deal with this passive reset in TCP, it's fine.
We pass a skb_drop_reason which should also be converted to u32 type
in tcp_v4_send_reset() as you said, it can work. People who use the
trace will see the reason with the 'SKB_DROP_REASON' prefix stripped.

But we have to deal with other cases. A few questions are listed here:
1) What about tcp_send_active_reset() in TCP/MPTCP? Passing weird drop
reasons? There is no drop reason at all. I think people will get
confused. So I believe we should invent new definitions to cope with
it.
2) What about the .send_reset callback in the subflow_syn_recv_sock()
in MPTCP? The reasons in MPTCP are only definitions (such as
MPTCP_RST_EUNSPEC). I don't think we can convert them into the enum
skb_drop_reason type.

So where should we group those various reasons?

Introducing a new enum is for extension and compatibility for all
kinds of reset reasons.

What do you think?

Thanks,
Jason

>
> skb_drop_reason are simply values that are later converted to strings...
>
> So : Do not declare a new enum.

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

On Fri, Apr 19, 2024 at 7:26 AM Jason Xing  wrote:
>
> > When I said "If you feel the need to put them in a special group, this
> > is fine by me.",
> > this was really about partitioning the existing enum into groups, if
> > you prefer having a group of 'RES reasons'
>
> Are you suggesting copying what we need from enum skb_drop_reason{} to
> enum sk_rst_reason{}? Why not reusing them directly. I have no idea
> what the side effect of cast conversion itself is?

Sorry that I'm writing this email. I'm worried my statement is not
that clear, so I write one simple snippet which can help me explain
well :)

Allow me to give NO_SOCKET as an example:
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e63a3bf99617..2c9f7364de45 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);

+   trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
 ende:
ip_rt_put(rt);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e650ec71d2f..d5963831280f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2160,6 +2160,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 {
struct net *net = dev_net(skb->dev);
enum skb_drop_reason drop_reason;
+   enum sk_rst_reason rst_reason;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@@ -2355,7 +2356,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb);
+   rst_reason = RST_REASON_NO_SOCKET;
+   tcp_v4_send_reset(NULL, skb, rst_reason);
}

 discard_it:

As you can see, we need to add a new 'rst_reason' variable which
actually is the same as drop reason. They are the same except for the
enum type... Such rst_reasons/drop_reasons are all over the place.

Eric, if you have a strong preference, I can do it as you said.

Well, how about explicitly casting them like this, based on the current
series? It looks better and clearer, and is more helpful to people who are
reading the code:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 461b4d2b7cfe..eb125163d819 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;

 reset:
-   tcp_v4_send_reset(rsk, skb, (u32)reason);
+   tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and

Thanks for your patience again.

Jason

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

> When I said "If you feel the need to put them in a special group, this
> is fine by me.",
> this was really about partitioning the existing enum into groups, if
> you prefer having a group of 'RES reasons'

Are you suggesting copying what we need from enum skb_drop_reason{} to
enum sk_rst_reason{}? Why not reuse them directly? I have no idea
what the side effect of the cast conversion itself is.

If __not__ doing so (copying reasons one by one), for passive resets,
we can totally rely on the drop reason, which means if we implement
more reasons for skb drop happening in reset cases, we don't need to
handle reset cases over and over again (like adding rst reasons just
after newly added drop reasons if without cast conversions). It's
easier to maintain the reset reason part if we can apply the current
patch series.

Thank you.

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

On Fri, Apr 19, 2024 at 6:29 AM Jason Xing  wrote:
>
> On Fri, Apr 19, 2024 at 2:51 AM Eric Dumazet  wrote:
> >
> > On Thu, Apr 18, 2024 at 6:24 PM Jason Xing  
> > wrote:
> > >
> > > On Thu, Apr 18, 2024 at 11:46 PM Jakub Kicinski  wrote:
> > > >
> > > > On Thu, 18 Apr 2024 11:30:02 +0800 Jason Xing wrote:
> > > > > I'm not sure why the patch series has been changed to 'Changes
> > > > > Requested', until now I don't think I need to change something.
> > > > >
> > > > > Should I repost this series (keeping the v6 tag) and then wait for
> > > > > more comments?
> > > >
> > > > If Eric doesn't like it - it's not getting merged.
> > >
> > > I'm not a English native speaker. If I understand correctly, it seems
> > > that Eric doesn't object to the patch series. Here is the quotation
> > > [1]:
> > > "If you feel the need to put them in a special group, this is fine by me."
> > >
> > > This rst reason mechanism can cover all the possible reasons for both
> > > TCP and MPTCP. We don't need to reinvent some definitions of reset
> > > reasons which are totally the same as drop reasons. Also, we don't
> > > need to reinvent something to cover MPTCP. If people are willing to
> > > contribute more rst reasons, they can find a good place.
> > >
> > > Reset is one big complicated 'issue' in production. I spent a lot of
> > > time handling all kinds of reset reasons daily. I'm apparently not the
> > > only one. I just want to make admins' lives easier, including me. This
> > > special/separate reason group is important because we can extend it in
> > > the future, which will not get confused.
> > >
> > > I hope it can have a chance to get merged :) Thank you.
> > >
> > > [1]: 
> > > https://lore.kernel.org/all/cann89i+alo_agyc8dr8dkfyi+6wpzcgrogysvgr8frfrvaa...@mail.gmail.com/
> > >
> > > Thanks,
> > > Jason
> >
> > My objection was these casts between enums. Especially if hiding with (u32)
>
> So I should explicitly cast it like this:
> tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
> ?
>
> >
> > I see no reason for adding these casts in TCP stack.
>
> Sorry, I don't know why the casts really make you so annoyed. But I
> still think it's not a bad way to handle all the cases for RST.
>
> Supposing not to add a enum sk_rst_reason{}, passing drop reasons only
> works well in TCP for passive rests. For active reset cases (in the
> tcp_send_active_reset()), it's meaningless/confusing to insist on
> reusing the drop reason because I have to add some reset reasons (that
> are only used in RST cases) in the enum skb_drop_reason{}, which is
> really weird, in my view. The same problem exists in how to handle
> MPTCP. So I prefer putting them in a separate group like now. What do
> you think about it, right now?

The description in the previous email may be too long. The key point
is that we're not supporting only for passive resets, right?

Thanks,
Jason

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

On Fri, Apr 19, 2024 at 2:51 AM Eric Dumazet  wrote:
>
> On Thu, Apr 18, 2024 at 6:24 PM Jason Xing  wrote:
> >
> > On Thu, Apr 18, 2024 at 11:46 PM Jakub Kicinski  wrote:
> > >
> > > On Thu, 18 Apr 2024 11:30:02 +0800 Jason Xing wrote:
> > > > I'm not sure why the patch series has been changed to 'Changes
> > > > Requested', until now I don't think I need to change something.
> > > >
> > > > Should I repost this series (keeping the v6 tag) and then wait for
> > > > more comments?
> > >
> > > If Eric doesn't like it - it's not getting merged.
> >
> > I'm not a English native speaker. If I understand correctly, it seems
> > that Eric doesn't object to the patch series. Here is the quotation
> > [1]:
> > "If you feel the need to put them in a special group, this is fine by me."
> >
> > This rst reason mechanism can cover all the possible reasons for both
> > TCP and MPTCP. We don't need to reinvent some definitions of reset
> > reasons which are totally the same as drop reasons. Also, we don't
> > need to reinvent something to cover MPTCP. If people are willing to
> > contribute more rst reasons, they can find a good place.
> >
> > Reset is one big complicated 'issue' in production. I spent a lot of
> > time handling all kinds of reset reasons daily. I'm apparently not the
> > only one. I just want to make admins' lives easier, including me. This
> > special/separate reason group is important because we can extend it in
> > the future, which will not get confused.
> >
> > I hope it can have a chance to get merged :) Thank you.
> >
> > [1]: 
> > https://lore.kernel.org/all/cann89i+alo_agyc8dr8dkfyi+6wpzcgrogysvgr8frfrvaa...@mail.gmail.com/
> >
> > Thanks,
> > Jason
>
> My objection was these casts between enums. Especially if hiding with (u32)

So I should explicitly cast it like this:
tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
?

>
> I see no reason for adding these casts in TCP stack.

Sorry, I don't know why the casts really make you so annoyed. But I
still think it's not a bad way to handle all the cases for RST.

Supposing we do not add an enum sk_rst_reason{}, passing drop reasons only
works well in TCP for passive resets. For active reset cases (in the
tcp_send_active_reset()), it's meaningless/confusing to insist on
reusing the drop reason because I have to add some reset reasons (that
are only used in RST cases) in the enum skb_drop_reason{}, which is
really weird, in my view. The same problem exists in how to handle
MPTCP. So I prefer putting them in a separate group like now. What do
you think about it, right now?

Thanks,
Jason

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

On Thu, Apr 18, 2024 at 11:46 PM Jakub Kicinski  wrote:
>
> On Thu, 18 Apr 2024 11:30:02 +0800 Jason Xing wrote:
> > I'm not sure why the patch series has been changed to 'Changes
> > Requested', until now I don't think I need to change something.
> >
> > Should I repost this series (keeping the v6 tag) and then wait for
> > more comments?
>
> If Eric doesn't like it - it's not getting merged.

I'm not a native English speaker. If I understand correctly, it seems
that Eric doesn't object to the patch series. Here is the quotation
[1]:
"If you feel the need to put them in a special group, this is fine by me."

This rst reason mechanism can cover all the possible reasons for both
TCP and MPTCP. We don't need to reinvent some definitions of reset
reasons which are totally the same as drop reasons. Also, we don't
need to reinvent something to cover MPTCP. If people are willing to
contribute more rst reasons, they can find a good place.

Reset is one big complicated 'issue' in production. I spent a lot of
time handling all kinds of reset reasons daily. I'm apparently not the
only one. I just want to make admins' lives easier, including me. This
special/separate reason group is important because we can extend it in
the future, which will not get confused.

I hope it can have a chance to get merged :) Thank you.

[1]: 
https://lore.kernel.org/all/cann89i+alo_agyc8dr8dkfyi+6wpzcgrogysvgr8frfrvaa...@mail.gmail.com/

Thanks,
Jason

[PATCH net-next v6 7/7] rstreason: make it work in trace world

2024-04-18 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 37 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..b1455cbc0634 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN1
+#define FN1(reason)TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+#undef FN2
+#define FN2(reason)TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_RST_REASON(FN1, FN1)
+
+#undef FN1
+#undef FNe1
+#define FN1(reason){ SK_RST_REASON_##reason, #reason },
+#define FNe1(reason)   { SK_RST_REASON_##reason, #reason }
+
+#undef FN2
+#undef FNe2
+#define FN2(reason){ SKB_DROP_REASON_##reason, #reason },
+#define FNe2(reason)   { SKB_DROP_REASON_##reason, #reason }
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +132,24 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->reason < RST_REASON_START ?
+   __print_symbolic(__entry->reason, 
DEFINE_DROP_REASON(FN2, FNe2)) :
+   __print_symbolic(__entry->reason, 
DEFINE_RST_REASON(FN1, FNe1)))
 );
 
+#undef FN1
+#undef FNe1
+
+#undef FN2
+#undef FNe2
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d78412cf8566..461b4d2b7cfe 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 276d9d541b01..b08ffb17d5a0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3612,7 +3612,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c46095fb596c..6a4736ec3df0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v6 6/7] mptcp: introducing a helper into active reset logic

2024-04-18 Thread Jason Xing

From: Jason Xing 

Since we have mapped every mptcp reset reason definition
in enum sk_rst_reason, introducing a new helper can cover
some missing places where we have already set the
subflow->reset_reason.

Note: using SK_RST_REASON_NOT_SPECIFIED is the same as
SK_RST_REASON_MPTCP_RST_EUNSPEC. They are both unknown.
So we can convert it directly.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
---
Link: 
https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include 
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..82ef2f42a1bc 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mptcp_pm_gen.h"
 
@@ -581,6 +582,16 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
*subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index bde4a7fdee82..4783d558863c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include 
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, _sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(>sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3

[PATCH net-next v6 5/7] mptcp: support rstreason for passive reset

2024-04-18 Thread Jason Xing

From: Jason Xing 

It relies on what the reset options in the skb are, as RFC 8684 says. Reusing
this logic can save us much energy. This patch replaces most of the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..bde4a7fdee82 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = convert_mptcp_reason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v6 4/7] tcp: support rstreason for passive reset

2024-04-18 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we don't need to implement those duplicated reset reasons.
This patch replaces all the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 8 
 net/ipv6/tcp_ipv6.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..d78412cf8566 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, (u32)reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb, (u32)drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2357,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, (u32)drop_reason);
}
 
 discard_it:
@@ -2409,7 +2409,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, (u32)drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..c46095fb596c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, (u32)reason);
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb, (u32)drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1942,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, (u32)drop_reason);
}
 
 discard_it:
@@ -1998,7 +1998,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, (u32)drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next v6 3/7] rstreason: prepare for active reset

2024-04-18 Thread Jason Xing

From: Jason Xing 

As we did for passive reset, only pass a possible reset reason
in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61119d42b0fd..276d9d541b01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3586,7 +3586,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)

[PATCH net-next v6 2/7] rstreason: prepare for passive reset

2024-04-18 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}

[PATCH net-next v6 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-18 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) reuse MP_TCPRST option for MPTCP
3) our own reasons

I will implement the basic code for active/passive reset reasons in
those three protocols, which is not complete at the moment. But
it gives other people a chance to add more reasons to
it :)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..0c3fa55fa62f
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FNe(MAX)
+
+#define RST_REASON_START (SKB_DROP_REASON_MAX + 1)
+
+/* There are three parts in order:
+ * 1) 0 - SKB_DROP_REASON_MAX: rely on drop reasons for passive reset in TCP
+ * 2) SKB_DROP_REASON_MAX + 1 - MPTCP_RST_EMIDDLEBOX: for MPTCP use
+ * 3) MPTCP_RST_EMIDDLEBOX - SK_RST_REASON_MAX: independent reset reason
+ */
+enum sk_rst_reason {
+   /* Leave this 'blank' part (0-SKB_DROP_REASON_MAX) for the reuse
+* of skb drop reason because rst reason relies on what drop reason
+* indicates exactly why it could happen.
+*/
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /* Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC = RST_REASON_START,
+   /* MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /* Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /* Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /* Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /* Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /* Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /* For the real standalone socket reset reason, we start from here */
+   SK_RST_REASON_NOT_SPECIFIED,
+
+   /* Maximum of socket reset reasons.
+* It shouldn't be used as a real 'reason'.
+*/
+   SK_RST_REASON_MAX,
+};
+
+static inline enum sk_rst_reason convert_mptcp_reason(u32 reason)
+{
+   return reason += RST_REASON_START;
+}
+#endif
-- 
2.37.3

[PATCH net-next RESEND v6 0/7] Implement reset reason mechanism to detect

2024-04-18 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own.

In this series, I focus on the fundamental implementation, mostly about how
the rstreason mechanism works, and give the detailed passive part as an
example, not including the active reset part. In the future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v6
1. add back casts, or else they are treated as error.
2. RESEND because the status of patchwork changed.

v5
Link: 
https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: 
https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: 
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree



Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |  4 +-
 include/net/rstreason.h| 93 ++
 include/net/tcp.h  |  3 +-
 include/trace/events/tcp.h | 37 +--
 net/dccp/ipv4.c| 10 ++--
 net/dccp/ipv6.c| 10 ++--
 net/dccp/minisocks.c   |  3 +-
 net/ipv4/tcp.c | 15 --
 net/ipv4/tcp_ipv4.c| 14 +++---
 net/ipv4/tcp_minisocks.c   |  3 +-
 net/ipv4/tcp_output.c  |  5 +-
 net/ipv4/tcp_timer.c   |  9 ++--
 net/ipv6/tcp_ipv6.c| 17 ---
 net/mptcp/protocol.c   |  2 +-
 net/mptcp/protocol.h   | 11 +
 net/mptcp/subflow.c| 27 ---
 16 files changed, 216 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: [PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-17 Thread Jason Xing

On Wed, Apr 17, 2024 at 4:51 PM Jason Xing  wrote:
>
> From: Jason Xing 
>
> In production, there are so many cases about why the RST skb is sent but
> we don't have a very convenient/fast method to detect the exact underlying
> reasons.
>
> RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
> and active kind (like tcp_send_active_reset()). The former can be traced
> carefully 1) in TCP, with the help of drop reasons, which is based on
> Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
> RFC 8684. The latter is relatively independent, which should be
> implemented on our own.
>
> In this series, I focus on the fundamental implementation, mostly about how
> the rstreason mechanism works, and give the detailed passive part as an
> example, not including the active reset part. In the future, we can go
> further and refine those NOT_SPECIFIED reasons.
>
> Here are some examples when tracing:
> -0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
> skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
> -0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
> skaddr=x src=x dest=x state=x reason=NO_SOCKET
>
> [1]
> Link: 
> https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/
>
> v6
> 1. add back casts, or else they are treated as error.
>
> v5
> Link: 
> https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
> 1. address format issue (like reverse xmas tree) (Eric, Paolo)
> 2. remove unnecessary casts. (Eric)
> 3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)
>
> v4
> Link: 
> https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
> 1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)
>
> v3
> Link: 
> https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
> 1. rebase (mptcp part) and address what Mat suggested.
>
> v2
> Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
> 1. rebase against the latest net-next tree
>
>
>
> Jason Xing (7):
>   net: introduce rstreason to detect why the RST is sent
>   rstreason: prepare for passive reset
>   rstreason: prepare for active reset
>   tcp: support rstreason for passive reset
>   mptcp: support rstreason for passive reset
>   mptcp: introducing a helper into active reset logic
>   rstreason: make it work in trace world
>
>  include/net/request_sock.h |  4 +-
>  include/net/rstreason.h| 93 ++
>  include/net/tcp.h  |  3 +-
>  include/trace/events/tcp.h | 37 +--
>  net/dccp/ipv4.c| 10 ++--
>  net/dccp/ipv6.c| 10 ++--
>  net/dccp/minisocks.c   |  3 +-
>  net/ipv4/tcp.c | 15 --
>  net/ipv4/tcp_ipv4.c| 14 +++---
>  net/ipv4/tcp_minisocks.c   |  3 +-
>  net/ipv4/tcp_output.c  |  5 +-
>  net/ipv4/tcp_timer.c   |  9 ++--
>  net/ipv6/tcp_ipv6.c| 17 ---
>  net/mptcp/protocol.c   |  2 +-
>  net/mptcp/protocol.h   | 11 +
>  net/mptcp/subflow.c| 27 ---
>  16 files changed, 216 insertions(+), 47 deletions(-)
>  create mode 100644 include/net/rstreason.h
>
> --
> 2.37.3
>

Hello maintainers,

I'm not sure why the status of the patch series has been changed to 'Changes
Requested'; so far I don't think I need to change anything.

Should I repost this series (keeping the v6 tag) and then wait for
more comments?

Thanks,
Jason

Re: [PATCH net-next v6 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-17 Thread Jason Xing

Hello Eric,

On Wed, Apr 17, 2024 at 5:02 PM Eric Dumazet  wrote:
>
> On Wed, Apr 17, 2024 at 10:51 AM Jason Xing  wrote:
> >
> > From: Jason Xing 
> >
> > Add a new standalone file for the easy future extension to support
> > both active reset and passive reset in the TCP/DCCP/MPTCP protocols.
> >
> > This patch only does the preparations for reset reason mechanism,
> > nothing else changes.
> >
> > The reset reasons are divided into three parts:
> > 1) reuse drop reasons for passive reset in TCP
> > 2) reuse MP_TCPRST option for MPTCP
> > 3) our own reasons
> >
> > I will implement the basic codes of active/passive reset reason in
> > those three protocols, which is not complete for this moment. But
> > it provides a new chance to let other people add more reasons into
> > it:)
> >
> > Signed-off-by: Jason Xing 
>
> My original suggestion was to use normal values in  'enum
> skb_drop_reason', even if there was not necessarily a 'drop'
> in the common sense.
>
> https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/
>
> This would avoid these ugly casts later, even casting an enum to other
> ones is not very logical.

Thanks for your comment.

It's a little bit tricky. That's the reason I documented and commented
on this in the rstreason.h file. I hope it's not that hard to
understand.

> Going through an u32 pivot is quite a hack.
>
> If you feel the need to put them in a special group, this is fine by me.

Yes, rst reasons only partially rely on the drop reason mechanism to
support passive rst for TCP well, but not supporting other cases. My
final goal is to cover all the cases for the future, so I wish I can
put it into a separate group, then people like me who find it useful
can introduce more reasons into it.

Thanks,
Jason

[PATCH net-next v6 6/7] mptcp: introducing a helper into active reset logic

2024-04-17 Thread Jason Xing

From: Jason Xing 

Since we have mapped every mptcp reset reason definition
in enum sk_rst_reason, introducing a new helper can cover
some missing places where we have already set the
subflow->reset_reason.

Note: using SK_RST_REASON_NOT_SPECIFIED is the same as
SK_RST_REASON_MPTCP_RST_EUNSPEC. They are both unknown.
So we can convert it directly.

Suggested-by: Paolo Abeni 
Signed-off-by: Jason Xing 
---
Link: 
https://lore.kernel.org/all/2d3ea199eef53cf6a0c48e21abdee0eefbdee927.ca...@redhat.com/
---
 net/mptcp/protocol.c |  4 +---
 net/mptcp/protocol.h | 11 +++
 net/mptcp/subflow.c  |  6 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 065967086492..4b13ca362efa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,7 +21,6 @@
 #endif
 #include 
 #include 
-#include <net/rstreason.h>
 #include 
 #include "protocol.h"
 #include "mib.h"
@@ -2570,8 +2569,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
slow = lock_sock_fast(tcp_sk);
if (tcp_sk->sk_state != TCP_CLOSE) {
-   tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(tcp_sk);
tcp_set_state(tcp_sk, TCP_CLOSE);
}
unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fdfa843e2d88..82ef2f42a1bc 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "mptcp_pm_gen.h"
 
@@ -581,6 +582,16 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context 
*subflow)
WRITE_ONCE(subflow->local_id, -1);
 }
 
+static inline void
+mptcp_send_active_reset_reason(struct sock *sk)
+{
+   struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(subflow->reset_reason);
+   tcp_send_active_reset(sk, GFP_ATOMIC, reason);
+}
+
 static inline u64
 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index bde4a7fdee82..4783d558863c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,7 +20,6 @@
 #include 
 #endif
 #include 
-#include <net/rstreason.h>
 
 #include "protocol.h"
 #include "mib.h"
@@ -424,7 +423,7 @@ void mptcp_subflow_reset(struct sock *ssk)
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
 
-   tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
mptcp_schedule_work(sk);
@@ -1362,8 +1361,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
-   tcp_send_active_reset(ssk, GFP_ATOMIC,
- SK_RST_REASON_NOT_SPECIFIED);
+   mptcp_send_active_reset_reason(ssk);
WRITE_ONCE(subflow->data_avail, false);
return false;
}
-- 
2.37.3

[PATCH net-next v6 7/7] rstreason: make it work in trace world

2024-04-17 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
Reviewed-by: Steven Rostedt (Google) 
---
 include/trace/events/tcp.h | 37 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..b1455cbc0634 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN1
+#define FN1(reason)TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+#undef FN2
+#define FN2(reason)TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_RST_REASON(FN1, FN1)
+
+#undef FN1
+#undef FNe1
+#define FN1(reason){ SK_RST_REASON_##reason, #reason },
+#define FNe1(reason)   { SK_RST_REASON_##reason, #reason }
+
+#undef FN2
+#undef FNe2
+#define FN2(reason){ SKB_DROP_REASON_##reason, #reason },
+#define FNe2(reason)   { SKB_DROP_REASON_##reason, #reason }
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const enum sk_rst_reason reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(enum sk_rst_reason, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +132,24 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->reason < RST_REASON_START ?
+   __print_symbolic(__entry->reason, 
DEFINE_DROP_REASON(FN2, FNe2)) :
+   __print_symbolic(__entry->reason, 
DEFINE_RST_REASON(FN1, FNe1)))
 );
 
+#undef FN1
+#undef FNe1
+
+#undef FN2
+#undef FNe2
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d78412cf8566..461b4d2b7cfe 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 276d9d541b01..b08ffb17d5a0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3612,7 +3612,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority,
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c46095fb596c..6a4736ec3df0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v6 5/7] mptcp: support rstreason for passive reset

2024-04-17 Thread Jason Xing

From: Jason Xing 

It relies on the reset options carried in the skb, as RFC 8684 specifies.
Reusing this logic saves us much effort. This patch replaces most of the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac867d277860..bde4a7fdee82 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -309,8 +309,13 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(mpext->reset_reason);
+   tcp_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 
@@ -377,8 +382,13 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+   enum sk_rst_reason reason;
+
+   reason = convert_mptcp_reason(mpext->reset_reason);
+   tcp6_request_sock_ops.send_reset(sk, skb, reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +793,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
struct mptcp_subflow_request_sock *subflow_req;
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
+   enum sk_rst_reason reason;
struct mptcp_sock *owner;
struct sock *child;
 
@@ -913,7 +924,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = convert_mptcp_reason(mptcp_get_ext(skb)->reset_reason);
+   req->rsk_ops->send_reset(sk, skb, reason);
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v6 4/7] tcp: support rstreason for passive reset

2024-04-17 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we don't need to implement those duplicated reset reasons.
This patch replaces all the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 8 
 net/ipv6/tcp_ipv6.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 418d11902fa7..d78412cf8566 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, (u32)reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb, (u32)drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2357,7 +2357,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, (u32)drop_reason);
}
 
 discard_it:
@@ -2409,7 +2409,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, (u32)drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..c46095fb596c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, (u32)reason);
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1865,7 +1865,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb, (u32)drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1942,7 +1942,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, (u32)drop_reason);
}
 
 discard_it:
@@ -1998,7 +1998,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, (u32)drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next v6 3/7] rstreason: prepare for active reset

2024-04-17 Thread Jason Xing

From: Jason Xing 

Like what we did to passive reset:
only passing possible reset reason in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  3 ++-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..adeacc9aa28a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b9ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include 
 #include 
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2903,7 +2906,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61119d42b0fd..276d9d541b01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3586,7 +3586,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+  enum sk_rst_reason reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)

[PATCH net-next v6 2/7] rstreason: prepare for passive reset

2024-04-17 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include <net/rstreason.h>
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ enum sk_rst_reason reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include <net/rstreason.h>
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  enum sk_rst_reason reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}

[PATCH net-next v6 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-17 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) reuse MP_TCPRST option for MPTCP
3) our own reasons

I will implement the basic codes of active/passive reset reason in
those three protocols, which is not complete for this moment. But
it provides a new chance to let other people add more reasons into
it:)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..0c3fa55fa62f
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include <net/dropreason-core.h>
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FNe(MAX)
+
+#define RST_REASON_START (SKB_DROP_REASON_MAX + 1)
+
+/* There are three parts in order:
+ * 1) 0 - SKB_DROP_REASON_MAX: rely on drop reasons for passive reset in TCP
+ * 2) SKB_DROP_REASON_MAX + 1 - MPTCP_RST_EMIDDLEBOX: for MPTCP use
+ * 3) MPTCP_RST_EMIDDLEBOX - SK_RST_REASON_MAX: independent reset reason
+ */
+enum sk_rst_reason {
+   /* Leave this 'blank' part (0-SKB_DROP_REASON_MAX) for the reuse
+* of skb drop reason because rst reason relies on what drop reason
+* indicates exactly why it could happen.
+*/
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /* Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC = RST_REASON_START,
+   /* MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /* Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /* Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /* Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /* Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /* Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /* For the real standalone socket reset reason, we start from here */
+   SK_RST_REASON_NOT_SPECIFIED,
+
+   /* Maximum of socket reset reasons.
+* It shouldn't be used as a real 'reason'.
+*/
+   SK_RST_REASON_MAX,
+};
+
+static inline enum sk_rst_reason convert_mptcp_reason(u32 reason)
+{
+   return reason += RST_REASON_START;
+}
+#endif
-- 
2.37.3

[PATCH net-next v6 0/7] Implement reset reason mechanism to detect

2024-04-17 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own.

In this series, I focus mostly on the fundamental implementation of how
the rstreason mechanism works and give the detailed passive part as an
example, not including the active reset part. In future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v6
1. add back casts, or else they are treated as error.

v5
Link: 
https://lore.kernel.org/all/2024045630.38420-1-kerneljasonx...@gmail.com/
1. address format issue (like reverse xmas tree) (Eric, Paolo)
2. remove unnecessary casts. (Eric)
3. introduce a helper used in mptcp active reset. See patch 6. (Paolo)

v4
Link: 
https://lore.kernel.org/all/20240409100934.37725-1-kerneljasonx...@gmail.com/
1. passing 'enum sk_rst_reason' for readability when tracing (Antoine)

v3
Link: 
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree



Jason Xing (7):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  mptcp: introducing a helper into active reset logic
  rstreason: make it work in trace world

 include/net/request_sock.h |  4 +-
 include/net/rstreason.h| 93 ++
 include/net/tcp.h  |  3 +-
 include/trace/events/tcp.h | 37 +--
 net/dccp/ipv4.c| 10 ++--
 net/dccp/ipv6.c| 10 ++--
 net/dccp/minisocks.c   |  3 +-
 net/ipv4/tcp.c | 15 --
 net/ipv4/tcp_ipv4.c| 14 +++---
 net/ipv4/tcp_minisocks.c   |  3 +-
 net/ipv4/tcp_output.c  |  5 +-
 net/ipv4/tcp_timer.c   |  9 ++--
 net/ipv6/tcp_ipv6.c| 17 ---
 net/mptcp/protocol.c   |  2 +-
 net/mptcp/protocol.h   | 11 +
 net/mptcp/subflow.c| 27 ---
 16 files changed, 216 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: Re: Re: Subject: [PATCH net-next v4] net/ipv4: add tracepoint for icmp_send

2024-04-11 Thread Jason Xing

On Thu, Apr 11, 2024 at 12:57 PM Peilin He  wrote:
>
> >> >[...]
> >> >> >I think my understanding based on what Eric depicted differs from you:
> >> >> >we're supposed to filter out those many invalid cases and only trace
> >> >> >the valid action of sending a icmp, so where to add a new tracepoint
> >> >> >is important instead of adding more checks in the tracepoint itself.
> >> >> >Please refer to what trace_tcp_retransmit_skb() does :)
> >> >> >
> >> >> >Thanks,
> >> >> >Jason
> >> >> Okay, thank you for your suggestion. In order to avoid filtering out
> >> >> those many invalid cases and only tracing the valid action of sending
> >> >> a icmp, the next patch will add udd_fail_no_port tracepoint to the
> >> >> include/trace/events/udp.h. This will solve the problem you mentioned
> >> >> very well. At this point, only UDP protocol exceptions will be tracked,
> >> >> without the need to track them in icmp_send.
> >> >
> >> >I'm not against what you did (tracing all the icmp_send() for UDP) in
> >> >your original patch. I was suggesting that you could put
> >> >trace_icmp_send() in the right place, then you don't have to check the
> >> >possible error condition (like if the skb->head is valid or not, ...)
> >> >in your trace function.
> >> >
> >> >One example that can avoid various checks existing in the
> >> >__icmp_send() function:
> >> >diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> >> >index e63a3bf99617..2c9f7364de45 100644
> >> >--- a/net/ipv4/icmp.c
> >> >+++ b/net/ipv4/icmp.c
> >> >@@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
> >> >int code, __be32 info,
> >> >if (!fl4.saddr)
> >> >fl4.saddr = htonl(INADDR_DUMMY);
> >> >
> >> >+   trace_icmp_send(skb_in, type, code);
> >> >icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
> >> > ende
> >> >ip_rt_put(rt);
> >> >
> >> >If we go here, it means we are ready to send the ICMP skb because
> >> >we're done extracting the right information in the 'struct sk_buff
> >> >skb_in'. Simpler and easier, right?
> >> >
> >> >Thanks,
> >> >Jason
> >>
> >> I may not fully agree with this viewpoint. When trace_icmp_send is placed
> >> in this position, it cannot guarantee that all skbs in icmp are UDP 
> >> protocols
> >> (UDP needs to be distinguished based on the proto_4!=IPPROTO_UDP 
> >> condition),
> >> nor can it guarantee the legitimacy of udphdr (*uh legitimacy check is 
> >> required).
> >
> >Of course, the UDP test statement is absolutely needed! Eric
> >previously pointed this out in the V1 patch thread. I'm not referring
> >to this one but like skb->head check something like this which exists
> >in __icmp_send() function. You can see there are so many checks in it
> >before sending.
> >
> >So only keeping the UDP check is enough, I think.
>
> The __icmp_send function only checks the IP header, but does not check
> the UDP header, as shown in the following code snippet:
>
> if ((u8 *)iph < skb_in->head ||
> (skb_network_header(skb_in) + sizeof(*iph)) >
> skb_tail_pointer(skb_in))
> goto out;
>
> There is no problem with the IP header check, which does not mean that
> the UDP header is correct. Therefore, I believe that it is essential to
> include a legitimacy judgment for the UDP header.
>
> Here is an explanation of this code:
> Firstly, the UDP header (*uh) is extracted from the skb.
> Then, if the current protocol of the skb is not UDP, or if the address of
> uh is outside the range of the skb, the source port and destination port
> will not be resolved, and 0 will be filled in directly. Otherwise,
> the source port and destination port of the UDP header will be resolved.
>
> +   struct udphdr *uh = udp_hdr(skb);
> +   if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head ||
> +   (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) {

From the beginning, I have always agreed with the UDP check. I was saying
that if you put the trace_icmp_send() just before icmp_push_reply()[1],
you could avoid those kinds of checks.
As I said in the previous email, "only keeping the UDP check is
enough". So you are right.

[1]
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e63a3bf99617..2c9f7364de45 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);

+   trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
 ende:
ip_rt_put(rt);

If we do this, trace_icmp_send() reflects the real action of sending an
ICMP message, like trace_tcp_retransmit_skb() does. Otherwise, the trace
could print some messages even though no real ICMP is sent (see those
error checks). WDYT?

Thanks,
Jason

>
> With best wishes
> Peilin He
>
> >Thanks,
> >Jason
> >
> >>
> >> With best wishes
> >> Peilin He
> >>
> >> >>
> >> >> >> 2.Target this patch for net-next.
> >> >> >>
> >> >> >> v2->v3:
> >>

Re: Re: Re: Subject: [PATCH net-next v4] net/ipv4: add tracepoint for icmp_send

2024-04-10 Thread Jason Xing

On Thu, Apr 11, 2024 at 10:34 AM Peilin He  wrote:
>
> >[...]
> >> >I think my understanding based on what Eric depicted differs from you:
> >> >we're supposed to filter out those many invalid cases and only trace
> >> >the valid action of sending a icmp, so where to add a new tracepoint
> >> >is important instead of adding more checks in the tracepoint itself.
> >> >Please refer to what trace_tcp_retransmit_skb() does :)
> >> >
> >> >Thanks,
> >> >Jason
> >> Okay, thank you for your suggestion. In order to avoid filtering out
> >> those many invalid cases and only tracing the valid action of sending
> >> an icmp, the next patch will add the udd_fail_no_port tracepoint to the
> >> include/trace/events/udp.h. This will solve the problem you mentioned
> >> very well. At this point, only UDP protocol exceptions will be tracked,
> >> without the need to track them in icmp_send.
> >
> >I'm not against what you did (tracing all the icmp_send() for UDP) in
> >your original patch. I was suggesting that you could put
> >trace_icmp_send() in the right place, then you don't have to check the
> >possible error condition (like if the skb->head is valid or not, ...)
> >in your trace function.
> >
> >One example that can avoid various checks existing in the
> >__icmp_send() function:
> >diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> >index e63a3bf99617..2c9f7364de45 100644
> >--- a/net/ipv4/icmp.c
> >+++ b/net/ipv4/icmp.c
> >@@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
> >int code, __be32 info,
> >if (!fl4.saddr)
> >fl4.saddr = htonl(INADDR_DUMMY);
> >
> >+   trace_icmp_send(skb_in, type, code);
> >icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
> > ende:
> >ip_rt_put(rt);
> >
> >If we go here, it means we are ready to send the ICMP skb because
> >we're done extracting the right information in the 'struct sk_buff
> >skb_in'. Simpler and easier, right?
> >
> >Thanks,
> >Jason
>
> I may not fully agree with this viewpoint. When trace_icmp_send is placed
> in this position, it cannot guarantee that all skbs in icmp are UDP protocols
> (UDP needs to be distinguished based on the proto_4!=IPPROTO_UDP condition),
> nor can it guarantee the legitimacy of udphdr (*uh legitimacy check is 
> required).

Of course, the UDP test statement is absolutely needed! Eric
previously pointed this out in the V1 patch thread. I'm not referring
to that check, but to checks such as the skb->head one that already exist
in the __icmp_send() function. You can see there are many such checks in it
before sending.

So only keeping the UDP check is enough, I think.

Thanks,
Jason

>
> With best wishes
> Peilin He
>
> >>
> >> >> 2.Target this patch for net-next.
> >> >>
> >> >> v2->v3:
> >> >> Some fixes according to
> >> >> https://lore.kernel.org/all/20240319102549.7f7f6...@gandalf.local.home/
> >> >> 1. Change the tracking directory to /sys/kernel/tracking.
> >> >> 2. Adjust the layout of the TP-STRUCT_entry parameter structure.
> >> >>
> >> >> v1->v2:
> >> >> Some fixes according to
> >> >> https://lore.kernel.org/all/CANn89iL-y9e_VFpdw=sZtRnKRu_tnUwqHuFQTJvJsv-nz1x...@mail.gmail.com/
> >> >> 1. adjust the trace_icmp_send() to more protocols than UDP.
> >> >> 2. move the calling of trace_icmp_send after sanity checks
> >> >> in __icmp_send().
> >> >>
> >> >> Signed-off-by: Peilin He
> >> >> Reviewed-by: xu xin 
> >> >> Reviewed-by: Yunkai Zhang 
> >> >> Cc: Yang Yang 
> >> >> Cc: Liu Chun 
> >> >> Cc: Xuexin Jiang 
> >> >> ---
> >> >>  include/trace/events/icmp.h | 65 +
> >> >>  net/ipv4/icmp.c |  4 +++
> >> >>  2 files changed, 69 insertions(+)
> >> >>  create mode 100644 include/trace/events/icmp.h
> >> >>
> >> >> diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
> >> >> new file mode 100644
> >> >> index ..7d5190f48a28
> >> >> --- /dev/null
> >> >> +++ b/include/trace/events/icmp.h
> >> >> @@ -0,0 +1,65 @@
> >> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> >> +#undef TRACE_SYSTEM
> >> >> +#define TRACE_SYSTEM icmp
> >> >> +
> >> >> +#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
> >> >> +#define _TRACE_ICMP_H
> >> >> +
> >> >> +#include 
> >> >> +#include 
> >> >> +
> >> >> +TRACE_EVENT(icmp_send,
> >> >> +
> >> >> +   TP_PROTO(const struct sk_buff *skb, int type, int code),
> >> >> +
> >> >> +   TP_ARGS(skb, type, code),
> >> >> +
> >> >> +   TP_STRUCT__entry(
> >> >> +   __field(const void *, skbaddr)
> >> >> +   __field(int, type)
> >> >> +   __field(int, code)
> >> >> +   __array(__u8, saddr, 4)
> >> >> +   __array(__u8, daddr, 4)
> >> >> +   __field(__u16, sport)
> >> >> +   __field(__u16, dport)
> >> >> +   __field(unsigned short, ulen)
> >> >> +   ),
> >> >> +
> >> >> +

Re: [PATCH net-next v3 2/6] rstreason: prepare for passive reset

2024-04-10 Thread Jason Xing

On Wed, Apr 10, 2024 at 9:21 PM Antoine Tenart  wrote:
>
> Quoting Jason Xing (2024-04-10 14:54:51)
> > Hi Antoine,
> >
> > On Wed, Apr 10, 2024 at 8:14 PM Antoine Tenart  wrote:
> > >
> > > Quoting Jason Xing (2024-04-09 12:09:30)
> > > > void(*send_reset)(const struct sock *sk,
> > > > - struct sk_buff *skb);
> > > > + struct sk_buff *skb,
> > > > + int reason);
> >
> > > what should be 'reason' harder. Eg. when looking at the code or when
> > > using BTF (to then install debugging probes with BPF) this is not
> > > obvious.
> >
> > Only one number if we want to extract the reason with BPF, right? I
> > haven't tried it.
>
> Yes, we can get 'reason'. Knowing the type helps.
>
> > > A similar approach could be done as the one used for drop reasons: enum
> > > skb_drop_reason is used for parameters (eg. kfree_skb_reason) but other
> > > valid values (subsystem drop reasons) can be used too if casted (to
> > > u32). We could use 'enum sk_rst_reason' and cast the other values. WDYT?
> >
> > I have been haunted by this 'issue' for a long time...
> >
> > Are you suggesting doing so as below for readability:
> > 1) replace the reason parameter in all the related functions (like
> > .send_reset(), tcp_v4_send_reset(), etc) by using 'enum sk_rst_reason'
> > type?
> > 2) in patch [4/6], when it needs to pass the specific reason in those
> > functions, we can cast it to 'enum sk_rst_reason'?
> >
> > One modification I just made based on this patchset if I understand 
> > correctly:
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index 4889fccbf754..e0419b8496b5 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -725,7 +725,7 @@ static bool tcp_v4_ao_sign_reset(const struct sock
> > *sk, struct sk_buff *skb,
> >   */
> >
> >  static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
> > - int reason)
> > + enum sk_rst_reason reason)
> >  {
> > const struct tcphdr *th = tcp_hdr(skb);
> > struct {
> > @@ -1935,7 +1935,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff 
> > *skb)
> > return 0;
> >
> >  reset:
> > -   tcp_v4_send_reset(rsk, skb, reason);
> > +   tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
> >  discard:
> > kfree_skb_reason(skb, reason);
> > /* Be careful here. If this function gets more complicated and
> >
>
> That's right. I think (u32) can also be used for the cast to make the
> compiler happy in 2), but the above makes sense.

Got it :) Will update soon.

Thanks,
Jason

Re: Re: Subject: [PATCH net-next v4] net/ipv4: add tracepoint for icmp_send

2024-04-10 Thread Jason Xing

[...]
> >I think my understanding based on what Eric depicted differs from you:
> >we're supposed to filter out those many invalid cases and only trace
> >the valid action of sending a icmp, so where to add a new tracepoint
> >is important instead of adding more checks in the tracepoint itself.
> >Please refer to what trace_tcp_retransmit_skb() does :)
> >
> >Thanks,
> >Jason
> Okay, thank you for your suggestion. In order to avoid filtering out
> those many invalid cases and only tracing the valid action of sending
> an icmp, the next patch will add the udd_fail_no_port tracepoint to the
> include/trace/events/udp.h. This will solve the problem you mentioned
> very well. At this point, only UDP protocol exceptions will be tracked,
> without the need to track them in icmp_send.

I'm not against what you did (tracing all the icmp_send() for UDP) in
your original patch. I was suggesting that you could put
trace_icmp_send() in the right place, then you don't have to check the
possible error condition (like if the skb->head is valid or not, ...)
in your trace function.

One example that can avoid various checks existing in the
__icmp_send() function:
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e63a3bf99617..2c9f7364de45 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -767,6 +767,7 @@ void __icmp_send(struct sk_buff *skb_in, int type,
int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);

+   trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
 ende:
ip_rt_put(rt);

If we go here, it means we are ready to send the ICMP skb because
we're done extracting the right information in the 'struct sk_buff
skb_in'. Simpler and easier, right?

Thanks,
Jason

>
> >> 2.Target this patch for net-next.
> >>
> >> v2->v3:
> >> Some fixes according to
> >> https://lore.kernel.org/all/20240319102549.7f7f6...@gandalf.local.home/
> >> 1. Change the tracking directory to /sys/kernel/tracking.
> >> 2. Adjust the layout of the TP-STRUCT_entry parameter structure.
> >>
> >> v1->v2:
> >> Some fixes according to
> >> https://lore.kernel.org/all/CANn89iL-y9e_VFpdw=sZtRnKRu_tnUwqHuFQTJvJsv-nz1x...@mail.gmail.com/
> >> 1. adjust the trace_icmp_send() to more protocols than UDP.
> >> 2. move the calling of trace_icmp_send after sanity checks
> >> in __icmp_send().
> >>
> >> Signed-off-by: Peilin He
> >> Reviewed-by: xu xin 
> >> Reviewed-by: Yunkai Zhang 
> >> Cc: Yang Yang 
> >> Cc: Liu Chun 
> >> Cc: Xuexin Jiang 
> >> ---
> >>  include/trace/events/icmp.h | 65 +
> >>  net/ipv4/icmp.c |  4 +++
> >>  2 files changed, 69 insertions(+)
> >>  create mode 100644 include/trace/events/icmp.h
> >>
> >> diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
> >> new file mode 100644
> >> index ..7d5190f48a28
> >> --- /dev/null
> >> +++ b/include/trace/events/icmp.h
> >> @@ -0,0 +1,65 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +#undef TRACE_SYSTEM
> >> +#define TRACE_SYSTEM icmp
> >> +
> >> +#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
> >> +#define _TRACE_ICMP_H
> >> +
> >> +#include 
> >> +#include 
> >> +
> >> +TRACE_EVENT(icmp_send,
> >> +
> >> +   TP_PROTO(const struct sk_buff *skb, int type, int code),
> >> +
> >> +   TP_ARGS(skb, type, code),
> >> +
> >> +   TP_STRUCT__entry(
> >> +   __field(const void *, skbaddr)
> >> +   __field(int, type)
> >> +   __field(int, code)
> >> +   __array(__u8, saddr, 4)
> >> +   __array(__u8, daddr, 4)
> >> +   __field(__u16, sport)
> >> +   __field(__u16, dport)
> >> +   __field(unsigned short, ulen)
> >> +   ),
> >> +
> >> +   TP_fast_assign(
> >> +   struct iphdr *iph = ip_hdr(skb);
> >> +   int proto_4 = iph->protocol;
> >> +   __be32 *p32;
> >> +
> >> +   __entry->skbaddr = skb;
> >> +   __entry->type = type;
> >> +   __entry->code = code;
> >> +
> >> +   struct udphdr *uh = udp_hdr(skb);
> >> +   if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head ||
> >> +   (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) {
> >> +   __entry->sport = 0;
> >> +   __entry->dport = 0;
> >> +   __entry->ulen = 0;
> >> +   } else {
> >> +   __entry->sport = ntohs(uh->source);
> >> +   __entry->dport = ntohs(uh->dest);
> >> +   __entry->ulen = ntohs(uh->len);
> >> +   }
> >> +   }
> >> +

Re: [PATCH net-next v3 2/6] rstreason: prepare for passive reset

2024-04-10 Thread Jason Xing

Hi Antoine,

On Wed, Apr 10, 2024 at 8:14 PM Antoine Tenart  wrote:
>
> Quoting Jason Xing (2024-04-09 12:09:30)
> > void(*send_reset)(const struct sock *sk,
> > - struct sk_buff *skb);
> > + struct sk_buff *skb,
> > + int reason);
>
> I get that 'int' is used instead of 'enum sk_rst_reason' to allow
> passing drop reasons too without casting, but that makes understanding

Yes!

> what should be 'reason' harder. Eg. when looking at the code or when
> using BTF (to then install debugging probes with BPF) this is not
> obvious.

Only one number if we want to extract the reason with BPF, right? I
haven't tried it.

>
> A similar approach could be done as the one used for drop reasons: enum
> skb_drop_reason is used for parameters (eg. kfree_skb_reason) but other
> valid values (subsystem drop reasons) can be used too if casted (to
> u32). We could use 'enum sk_rst_reason' and cast the other values. WDYT?

I have been haunted by this 'issue' for a long time...

Are you suggesting doing so as below for readability:
1) replace the reason parameter in all the related functions (like
.send_reset(), tcp_v4_send_reset(), etc) by using 'enum sk_rst_reason'
type?
2) in patch [4/6], when it needs to pass the specific reason in those
functions, we can cast it to 'enum sk_rst_reason'?

One modification I just made based on this patchset if I understand correctly:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4889fccbf754..e0419b8496b5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -725,7 +725,7 @@ static bool tcp_v4_ao_sign_reset(const struct sock
*sk, struct sk_buff *skb,
  */

 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
- int reason)
+ enum sk_rst_reason reason)
 {
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -1935,7 +1935,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;

 reset:
-   tcp_v4_send_reset(rsk, skb, reason);
+   tcp_v4_send_reset(rsk, skb, (enum sk_rst_reason)reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and

It, indeed, looks better (clean and clear) :)

So I also ought to adjust the trace_tcp_send_reset part...

Thanks,
Jason

Re: [PATCH net-next v3 6/6] rstreason: make it work in trace world

2024-04-09 Thread Jason Xing

Hi Steven,

On Tue, Apr 9, 2024 at 11:36 PM Steven Rostedt  wrote:
>
> On Tue,  9 Apr 2024 18:09:34 +0800
> Jason Xing  wrote:
>
> >  /*
> >   * tcp event with arguments sk and skb
> > @@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
> >   TP_ARGS(sk, skb)
> >  );
> >
> > +#undef FN1
> > +#define FN1(reason)  TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
> > +#undef FN2
> > +#define FN2(reason)  TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
> > +DEFINE_RST_REASON(FN1, FN1)
>
> Interesting. I've never seen the passing of the internal macros to the main
> macro before. I see that you are using it for handling both the
> SK_RST_REASON and the SK_DROP_REASON.

Yes, I want to cover both kinds of reasons and strip their prefixes,
so that the bare reason names can be reported to userspace.

>
> > +
> > +#undef FN1
> > +#undef FNe1
> > +#define FN1(reason)  { SK_RST_REASON_##reason, #reason },
> > +#define FNe1(reason) { SK_RST_REASON_##reason, #reason }
> > +
> > +#undef FN2
> > +#undef FNe2
> > +#define FN2(reason)  { SKB_DROP_REASON_##reason, #reason },
> > +#define FNe2(reason) { SKB_DROP_REASON_##reason, #reason }
>
> Anyway, from a tracing point of view, as it looks like it would work
> (I haven't tested it).

Sure, it works. One simple test if you're interested:
1) Apply this patchset locally
2) add 'trace_tcp_send_reset(sk, skb, [one reason])' in the receive
path, say, somewhere in the tcp_v4_rcv()

The possible result can be seen in the cover letter. I list here:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

>
> Reviewed-by: Steven Rostedt (Google) 

Thanks!

>
> -- Steve

[PATCH net-next v3 6/6] rstreason: make it work in trace world

2024-04-09 Thread Jason Xing

From: Jason Xing 

Finally, make it work by introducing the reset reason into the
tracing world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 37 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..9bed9e63c9c5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN1
+#define FN1(reason)TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+#undef FN2
+#define FN2(reason)TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_RST_REASON(FN1, FN1)
+
+#undef FN1
+#undef FNe1
+#define FN1(reason){ SK_RST_REASON_##reason, #reason },
+#define FNe1(reason)   { SK_RST_REASON_##reason, #reason }
+
+#undef FN2
+#undef FNe2
+#define FN2(reason){ SKB_DROP_REASON_##reason, #reason },
+#define FNe2(reason)   { SKB_DROP_REASON_##reason, #reason }
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const int reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(int, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +132,24 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->reason < RST_REASON_START ?
+   __print_symbolic(__entry->reason, 
DEFINE_DROP_REASON(FN2, FNe2)) :
+   __print_symbolic(__entry->reason, 
DEFINE_RST_REASON(FN1, FNe1)))
 );
 
+#undef FN1
+#undef FNe1
+
+#undef FN2
+#undef FNe2
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 03c5af9decbf..4889fccbf754 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d807b5c1b9c..710922f7d4d6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3610,7 +3610,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority, int reason)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6889ea70c760..3c995eff6e52 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1131,7 +1131,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v3 5/6] mptcp: support rstreason for passive reset

2024-04-09 Thread Jason Xing

From: Jason Xing 

It relies on the reset options carried in the skb, as RFC 8684 specifies.
Reusing this logic saves us a lot of effort. This patch replaces most of
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ba0a252c113f..4f2be72d5b02 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -308,8 +308,11 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+
+   tcp_request_sock_ops.send_reset(sk, skb, mpext->reset_reason);
+   }
return NULL;
 }
 
@@ -375,8 +378,11 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
return dst;
 
dst_release(dst);
-   if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   if (!req->syncookie) {
+   struct mptcp_ext *mpext = mptcp_get_ext(skb);
+
+   tcp6_request_sock_ops.send_reset(sk, skb, mpext->reset_reason);
+   }
return NULL;
 }
 #endif
@@ -783,6 +789,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
bool fallback, fallback_is_fatal;
struct mptcp_sock *owner;
struct sock *child;
+   int reason;
 
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
 
@@ -911,7 +918,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   reason = mptcp_get_ext(skb)->reset_reason;
+   req->rsk_ops->send_reset(sk, skb, convert_mptcp_reason(reason));
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v3 4/6] tcp: support rstreason for passive reset

2024-04-09 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we don't need to implement those duplicated reset reasons.
This patch replaces all the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 8 
 net/ipv6/tcp_ipv6.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 21fa69445f7a..03c5af9decbf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1935,7 +1935,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2278,7 +2278,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2356,7 +2356,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -2407,7 +2407,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7e591521b193..6889ea70c760 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1678,7 +1678,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, reason);
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1864,7 +1864,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1940,7 +1940,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -1995,7 +1995,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next v3 3/6] rstreason: prepare for active reset

2024-04-09 Thread Jason Xing

From: Jason Xing 

As we did for passive reset, only pass the possible reset reason
in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  2 +-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9ab5b37e9d53..67ab4dbf7805 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -667,7 +667,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 664c8ecb076b..d1610d4deb8f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2805,7 +2806,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2879,7 +2881,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2897,7 +2900,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3001,7 +3005,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4557,7 +4561,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9282fafc0e61..6d807b5c1b9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3585,7 +3585,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)
(!tp->snd_wnd && !tp->packets_out))

[PATCH net-next v3 2/6] rstreason: prepare for passive reset

2024-04-09 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  3 ++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..93f9fee7e52f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -34,7 +34,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ int reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..11b8d14be3e2 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..232092dc3887 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 64d805b27add..251a57cf5822 100644
--- a/net/dccp/minisocks.c

[PATCH net-next v3 1/6] net: introduce rstreason to detect why the RST is sent

2024-04-09 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) reuse MP_TCPRST option for MPTCP
3) our own reasons

I will implement the basic code for the active/passive reset reasons in
those three protocols. It is not complete at the moment, but it gives
other people the chance to add more reasons to it :)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..24d098a78a60
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FNe(MAX)
+
+#define RST_REASON_START (SKB_DROP_REASON_MAX + 1)
+
+/* There are three parts in order:
+ * 1) 0 - SKB_DROP_REASON_MAX: rely on drop reasons for passive reset in TCP
+ * 2) SKB_DROP_REASON_MAX + 1 - MPTCP_RST_EMIDDLEBOX: for MPTCP use
+ * 3) MPTCP_RST_EMIDDLEBOX - SK_RST_REASON_MAX: independent reset reason
+ */
+enum sk_rst_reason {
+   /* Leave this 'blank' part (0-SKB_DROP_REASON_MAX) for the reuse
+* of skb drop reason because rst reason relies on what drop reason
+* indicates exactly why it could happen.
+*/
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /* Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC = RST_REASON_START,
+   /* MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /* Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /* Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /* Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /* Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /* Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /* For the real standalone socket reset reason, we start from here */
+   SK_RST_REASON_NOT_SPECIFIED,
+
+   /* Maximum of socket reset reasons.
+* It shouldn't be used as a real 'reason'.
+*/
+   SK_RST_REASON_MAX,
+};
+
+static inline int convert_mptcp_reason(int reason)
+{
+   return reason += RST_REASON_START;
+}
+#endif
-- 
2.37.3

[PATCH net-next v3 0/6] Implement reset reason mechanism to detect

2024-04-09 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own.

In this series, I focus mostly on the fundamental implementation of how
the rstreason mechanism works and give the detailed passive part as an
example, not including the active reset part. In the future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v3
Link:
https://lore.kernel.org/all/20240404072047.11490-1-kerneljasonx...@gmail.com/
1. rebase (mptcp part) and address what Mat suggested.

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree

Jason Xing (6):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  rstreason: make it work in trace world

 include/net/request_sock.h |  3 +-
 include/net/rstreason.h| 93 ++
 include/net/tcp.h  |  2 +-
 include/trace/events/tcp.h | 37 +--
 net/dccp/ipv4.c| 10 ++--
 net/dccp/ipv6.c| 10 ++--
 net/dccp/minisocks.c   |  3 +-
 net/ipv4/tcp.c | 15 --
 net/ipv4/tcp_ipv4.c| 14 +++---
 net/ipv4/tcp_minisocks.c   |  3 +-
 net/ipv4/tcp_output.c  |  4 +-
 net/ipv4/tcp_timer.c   |  9 ++--
 net/ipv6/tcp_ipv6.c| 17 ---
 net/mptcp/protocol.c   |  4 +-
 net/mptcp/subflow.c| 25 +++---
 15 files changed, 202 insertions(+), 47 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: [PATCH net-next v2 5/6] mptcp: support rstreason for passive reset

2024-04-04 Thread Jason Xing

Hello Mat,

On Fri, Apr 5, 2024 at 4:33 AM Mat Martineau  wrote:
>
> On Thu, 4 Apr 2024, Jason Xing wrote:
>
> > From: Jason Xing 
> >
> > It relies on what the reset options in MPTCP do, as RFC 8684 says. Reusing
> > this logic can save us much energy. This patch replaces all the prior
> > NOT_SPECIFIED reasons.
> >
> > Signed-off-by: Jason Xing 
> > ---
> > net/mptcp/subflow.c | 26 --
> > 1 file changed, 20 insertions(+), 6 deletions(-)
> >
> > diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> > index a68d5d0f3e2a..24668d3020aa 100644
> > --- a/net/mptcp/subflow.c
> > +++ b/net/mptcp/subflow.c
> > @@ -304,7 +304,10 @@ static struct dst_entry *subflow_v4_route_req(const 
> > struct sock *sk,
> >
> >   dst_release(dst);
> >   if (!req->syncookie)
> > - tcp_request_sock_ops.send_reset(sk, skb, 
> > SK_RST_REASON_NOT_SPECIFIED);
> > + /* According to RFC 8684, 3.2. Starting a New Subflow,
> > +  * we should use an "MPTCP specific error" reason code.
> > +  */
> > + tcp_request_sock_ops.send_reset(sk, skb, 
> > SK_RST_REASON_MPTCP_RST_EMPTCP);
>
> Hi Jason -
>
> In this case, the MPTCP reset reason is set in subflow_check_req(). Looks
> like it uses EMPTCP but that isn't guaranteed to stay the same. I think it
> would be better to extract the reset reason from the skb extension or the
> subflow context "reset_reason" field.

Good suggestions :)

>
>
> >   return NULL;
> > }
> >
> > @@ -371,7 +374,10 @@ static struct dst_entry *subflow_v6_route_req(const 
> > struct sock *sk,
> >
> >   dst_release(dst);
> >   if (!req->syncookie)
> > - tcp6_request_sock_ops.send_reset(sk, skb, 
> > SK_RST_REASON_NOT_SPECIFIED);
> > + /* According to RFC 8684, 3.2. Starting a New Subflow,
> > +  * we should use an "MPTCP specific error" reason code.
> > +  */
> > + tcp6_request_sock_ops.send_reset(sk, skb, 
> > SK_RST_REASON_MPTCP_RST_EMPTCP);
>
> Same issue here.

Got it.

>
> >   return NULL;
> > }
> > #endif
> > @@ -778,6 +784,7 @@ static struct sock *subflow_syn_recv_sock(const struct 
> > sock *sk,
> >   bool fallback, fallback_is_fatal;
> >   struct mptcp_sock *owner;
> >   struct sock *child;
> > + int reason;
> >
> >   pr_debug("listener=%p, req=%p, conn=%p", listener, req, 
> > listener->conn);
> >
> > @@ -833,7 +840,8 @@ static struct sock *subflow_syn_recv_sock(const struct 
> > sock *sk,
> >*/
> >   if (!ctx || fallback) {
> >   if (fallback_is_fatal) {
> > - subflow_add_reset_reason(skb, 
> > MPTCP_RST_EMPTCP);
> > + reason = MPTCP_RST_EMPTCP;
> > + subflow_add_reset_reason(skb, reason);
> >   goto dispose_child;
> >   }
> >   goto fallback;
> > @@ -861,7 +869,8 @@ static struct sock *subflow_syn_recv_sock(const struct 
> > sock *sk,
> >   } else if (ctx->mp_join) {
> >   owner = subflow_req->msk;
> >   if (!owner) {
> > - subflow_add_reset_reason(skb, 
> > MPTCP_RST_EPROHIBIT);
> > + reason = MPTCP_RST_EPROHIBIT;
> > + subflow_add_reset_reason(skb, reason);
> >   goto dispose_child;
> >   }
> >
> > @@ -875,13 +884,18 @@ static struct sock *subflow_syn_recv_sock(const 
> > struct sock *sk,
> >ntohs(inet_sk((struct sock 
> > *)owner)->inet_sport));
> >   if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
> >   SUBFLOW_REQ_INC_STATS(req, 
> > MPTCP_MIB_MISMATCHPORTACKRX);
> > + reason = MPTCP_RST_EUNSPEC;
>
> I think the MPTCP code here should have been using MPTCP_RST_EPROHIBIT.

I'll update in the V2 of the patch.

Thanks,
Jason

[PATCH net-next v2 6/6] rstreason: make it work in trace world

2024-04-04 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 37 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..9bed9e63c9c5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN1
+#define FN1(reason)TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+#undef FN2
+#define FN2(reason)TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_RST_REASON(FN1, FN1)
+
+#undef FN1
+#undef FNe1
+#define FN1(reason){ SK_RST_REASON_##reason, #reason },
+#define FNe1(reason)   { SK_RST_REASON_##reason, #reason }
+
+#undef FN2
+#undef FNe2
+#define FN2(reason){ SKB_DROP_REASON_##reason, #reason },
+#define FNe2(reason)   { SKB_DROP_REASON_##reason, #reason }
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const int reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(int, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +132,24 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->reason < RST_REASON_START ?
+   __print_symbolic(__entry->reason, 
DEFINE_DROP_REASON(FN2, FNe2)) :
+   __print_symbolic(__entry->reason, 
DEFINE_RST_REASON(FN1, FNe1)))
 );
 
+#undef FN1
+#undef FNe1
+
+#undef FN2
+#undef FNe2
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1ae2716f0c34..9c52a4a74842 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18fbbad2028a..d5a7ecfcc1b3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3608,7 +3608,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority, int reason)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6889ea70c760..3c995eff6e52 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1131,7 +1131,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v2 5/6] mptcp: support rstreason for passive reset

2024-04-04 Thread Jason Xing

From: Jason Xing 

It relies on what the reset options in MPTCP do, as RFC 8684 says. Reusing
this logic can save us much energy. This patch replaces all the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index a68d5d0f3e2a..24668d3020aa 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -304,7 +304,10 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
 
dst_release(dst);
if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   /* According to RFC 8684, 3.2. Starting a New Subflow,
+* we should use an "MPTCP specific error" reason code.
+*/
+   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_MPTCP_RST_EMPTCP);
return NULL;
 }
 
@@ -371,7 +374,10 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
 
dst_release(dst);
if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   /* According to RFC 8684, 3.2. Starting a New Subflow,
+* we should use an "MPTCP specific error" reason code.
+*/
+   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_MPTCP_RST_EMPTCP);
return NULL;
 }
 #endif
@@ -778,6 +784,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
bool fallback, fallback_is_fatal;
struct mptcp_sock *owner;
struct sock *child;
+   int reason;
 
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
 
@@ -833,7 +840,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
 */
if (!ctx || fallback) {
if (fallback_is_fatal) {
-   subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
+   reason = MPTCP_RST_EMPTCP;
+   subflow_add_reset_reason(skb, reason);
goto dispose_child;
}
goto fallback;
@@ -861,7 +869,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
} else if (ctx->mp_join) {
owner = subflow_req->msk;
if (!owner) {
-   subflow_add_reset_reason(skb, 
MPTCP_RST_EPROHIBIT);
+   reason = MPTCP_RST_EPROHIBIT;
+   subflow_add_reset_reason(skb, reason);
goto dispose_child;
}
 
@@ -875,13 +884,18 @@ static struct sock *subflow_syn_recv_sock(const struct 
sock *sk,
 ntohs(inet_sk((struct sock 
*)owner)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
SUBFLOW_REQ_INC_STATS(req, 
MPTCP_MIB_MISMATCHPORTACKRX);
+   reason = MPTCP_RST_EUNSPEC;
goto dispose_child;
}
SUBFLOW_REQ_INC_STATS(req, 
MPTCP_MIB_JOINPORTACKRX);
}
 
-   if (!mptcp_finish_join(child))
+   if (!mptcp_finish_join(child)) {
+   struct mptcp_subflow_context *subflow = 
mptcp_subflow_ctx(sk);
+
+   reason = subflow->reset_reason;
goto dispose_child;
+   }
 
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
@@ -901,7 +915,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   req->rsk_ops->send_reset(sk, skb, convert_mptcp_reason(reason));
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next v2 4/6] tcp: support rstreason for passive reset

2024-04-04 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we don't need to implement those duplicated reset reasons.
This patch replaces all the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 8 
 net/ipv6/tcp_ipv6.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d8d98db8f58e..1ae2716f0c34 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1935,7 +1935,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2280,7 +2280,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2358,7 +2358,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -2409,7 +2409,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7e591521b193..6889ea70c760 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1678,7 +1678,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, reason);
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1864,7 +1864,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1940,7 +1940,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -1995,7 +1995,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next v2 3/6] rstreason: prepare for active reset

2024-04-04 Thread Jason Xing

From: Jason Xing 

As we did for passive reset, only pass the possible reset reason
along each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  2 +-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9ab5b37e9d53..67ab4dbf7805 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -667,7 +667,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e767721b3a58..eacfe0012977 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2805,7 +2806,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2879,7 +2881,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2897,7 +2900,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3001,7 +3005,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4557,7 +4561,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..18fbbad2028a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3583,7 +3583,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)
(!tp->snd_wnd && !tp->packets_out))

[PATCH net-next v2 2/6] rstreason: prepare for passive reset

2024-04-04 Thread Jason Xing

From: Jason Xing 

Adjust the parameter and support passing the reason of the reset, which
is NOT_SPECIFIED for now. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  3 ++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..93f9fee7e52f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -34,7 +34,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ int reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..11b8d14be3e2 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..232092dc3887 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 64d805b27add..251a57cf5822 100644
--- a/net/dccp/minisocks.c

[PATCH net-next v2 1/6] net: introduce rstreason to detect why the RST is sent

2024-04-04 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) reuse MP_TCPRST option for MPTCP
3) our own reasons

I will implement the basic code for the active/passive reset reasons in
those three protocols. It is not complete at the moment, but it gives
other people the chance to add more reasons to it :)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..24d098a78a60
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FNe(MAX)
+
+#define RST_REASON_START (SKB_DROP_REASON_MAX + 1)
+
+/* There are three parts in order:
+ * 1) 0 - SKB_DROP_REASON_MAX: rely on drop reasons for passive reset in TCP
+ * 2) SKB_DROP_REASON_MAX + 1 - MPTCP_RST_EMIDDLEBOX: for MPTCP use
+ * 3) MPTCP_RST_EMIDDLEBOX - SK_RST_REASON_MAX: independent reset reason
+ */
+enum sk_rst_reason {
+   /* Leave this 'blank' part (0-SKB_DROP_REASON_MAX) for the reuse
+* of skb drop reason because rst reason relies on what drop reason
+* indicates exactly why it could happen.
+*/
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /* Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC = RST_REASON_START,
+   /* MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /* Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /* Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /* Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /* Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /* Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /* For the real standalone socket reset reason, we start from here */
+   SK_RST_REASON_NOT_SPECIFIED,
+
+   /* Maximum of socket reset reasons.
+* It shouldn't be used as a real 'reason'.
+*/
+   SK_RST_REASON_MAX,
+};
+
+static inline int convert_mptcp_reason(int reason)
+{
+   return reason += RST_REASON_START;
+}
+#endif
-- 
2.37.3

[PATCH net-next 0/6] Implement reset reason mechanism to detect

2024-04-04 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own.

In this series, I focus mostly on the fundamental implementation: how
the rstreason mechanism and the detailed passive part work as an
example, not including the active reset part. In the future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

v2
Link: https://lore.kernel.org/all/20240403185033.47ebc...@kernel.org/
1. rebase against the latest net-next tree

Jason Xing (6):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  rstreason: make it work in trace world

 include/net/request_sock.h |  3 +-
 include/net/rstreason.h| 93 ++
 include/net/tcp.h  |  2 +-
 include/trace/events/tcp.h | 37 +--
 net/dccp/ipv4.c| 10 ++--
 net/dccp/ipv6.c| 10 ++--
 net/dccp/minisocks.c   |  3 +-
 net/ipv4/tcp.c | 15 --
 net/ipv4/tcp_ipv4.c| 14 +++---
 net/ipv4/tcp_minisocks.c   |  3 +-
 net/ipv4/tcp_output.c  |  4 +-
 net/ipv4/tcp_timer.c   |  9 ++--
 net/ipv6/tcp_ipv6.c| 17 ---
 net/mptcp/protocol.c   |  4 +-
 net/mptcp/subflow.c| 33 ++
 15 files changed, 209 insertions(+), 48 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: [PATCH net-next 0/6] Implement reset reason mechanism to detect

2024-04-03 Thread Jason Xing

On Thu, Apr 4, 2024 at 9:50 AM Jakub Kicinski  wrote:
>
> On Wed,  3 Apr 2024 15:31:38 +0800 Jason Xing wrote:
> > It's based on top of 
> > https://patchwork.kernel.org/project/netdevbpf/list/?series=840182
>
> Please post as RFC if there's a dependency.
> We don't maintain patch queues for people.

Got it. Thanks.

I'll wait for that patch series to get merged. I believe it will not
take too long:)

> --
> pw-bot: cr

[PATCH net-next 6/6] rstreason: make it work in trace world

2024-04-03 Thread Jason Xing

From: Jason Xing 

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 37 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..9bed9e63c9c5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,38 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
TP_ARGS(sk, skb)
 );
 
+#undef FN1
+#define FN1(reason)TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+#undef FN2
+#define FN2(reason)TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
+DEFINE_RST_REASON(FN1, FN1)
+
+#undef FN1
+#undef FNe1
+#define FN1(reason){ SK_RST_REASON_##reason, #reason },
+#define FNe1(reason)   { SK_RST_REASON_##reason, #reason }
+
+#undef FN2
+#undef FNe2
+#define FN2(reason){ SKB_DROP_REASON_##reason, #reason },
+#define FNe2(reason)   { SKB_DROP_REASON_##reason, #reason }
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+const int reason),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, reason),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
__field(int, state)
+   __field(int, reason)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
),
@@ -113,14 +132,24 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
}
+   __entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
reason=%s",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->reason < RST_REASON_START ?
+   __print_symbolic(__entry->reason, 
DEFINE_DROP_REASON(FN2, FNe2)) :
+   __print_symbolic(__entry->reason, 
DEFINE_RST_REASON(FN1, FNe1)))
 );
 
+#undef FN1
+#undef FNe1
+
+#undef FN2
+#undef FNe2
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 35b0f3bbf596..3aee7cb35ee4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb,
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18fbbad2028a..d5a7ecfcc1b3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3608,7 +3608,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority, int reason)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cfcfa2626899..da2f70ad89b5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1131,7 +1131,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb,
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb, reason);
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next 5/6] mptcp: support rstreason for passive reset

2024-04-03 Thread Jason Xing

From: Jason Xing 

It relies on what the reset options in MPTCP do, as RFC 8684 says. Reusing
this logic can save us much effort. This patch replaces all the prior
NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/mptcp/subflow.c | 26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index a68d5d0f3e2a..24668d3020aa 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -304,7 +304,10 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
 
dst_release(dst);
if (!req->syncookie)
-   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   /* According to RFC 8684, 3.2. Starting a New Subflow,
+* we should use an "MPTCP specific error" reason code.
+*/
+   tcp_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_MPTCP_RST_EMPTCP);
return NULL;
 }
 
@@ -371,7 +374,10 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
 
dst_release(dst);
if (!req->syncookie)
-   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   /* According to RFC 8684, 3.2. Starting a New Subflow,
+* we should use an "MPTCP specific error" reason code.
+*/
+   tcp6_request_sock_ops.send_reset(sk, skb, 
SK_RST_REASON_MPTCP_RST_EMPTCP);
return NULL;
 }
 #endif
@@ -778,6 +784,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
bool fallback, fallback_is_fatal;
struct mptcp_sock *owner;
struct sock *child;
+   int reason;
 
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
 
@@ -833,7 +840,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
 */
if (!ctx || fallback) {
if (fallback_is_fatal) {
-   subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
+   reason = MPTCP_RST_EMPTCP;
+   subflow_add_reset_reason(skb, reason);
goto dispose_child;
}
goto fallback;
@@ -861,7 +869,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
} else if (ctx->mp_join) {
owner = subflow_req->msk;
if (!owner) {
-   subflow_add_reset_reason(skb, 
MPTCP_RST_EPROHIBIT);
+   reason = MPTCP_RST_EPROHIBIT;
+   subflow_add_reset_reason(skb, reason);
goto dispose_child;
}
 
@@ -875,13 +884,18 @@ static struct sock *subflow_syn_recv_sock(const struct 
sock *sk,
 ntohs(inet_sk((struct sock 
*)owner)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
SUBFLOW_REQ_INC_STATS(req, 
MPTCP_MIB_MISMATCHPORTACKRX);
+   reason = MPTCP_RST_EUNSPEC;
goto dispose_child;
}
SUBFLOW_REQ_INC_STATS(req, 
MPTCP_MIB_JOINPORTACKRX);
}
 
-   if (!mptcp_finish_join(child))
+   if (!mptcp_finish_join(child)) {
+   struct mptcp_subflow_context *subflow = 
mptcp_subflow_ctx(sk);
+
+   reason = subflow->reset_reason;
goto dispose_child;
+   }
 
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
@@ -901,7 +915,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock 
*sk,
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
-   req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   req->rsk_ops->send_reset(sk, skb, convert_mptcp_reason(reason));
 
/* The last child reference will be released by the caller */
return child;
-- 
2.37.3

[PATCH net-next 4/6] tcp: support rstreason for passive reset

2024-04-03 Thread Jason Xing

From: Jason Xing 

Reuse the dropreason logic to show the exact reason of tcp reset,
so we don't need to implement those duplicated reset reasons.
This patch replaces all the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing 
---
 net/ipv4/tcp_ipv4.c | 8 
 net/ipv6/tcp_ipv6.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c8248abe37a..35b0f3bbf596 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1935,7 +1935,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(rsk, skb, reason);
 discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2280,7 +2280,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v4_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2358,7 +2358,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -2409,7 +2409,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v4_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f143b658fb71..cfcfa2626899 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1678,7 +1678,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, reason);
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
@@ -1864,7 +1864,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
-   tcp_v6_send_reset(nsk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(nsk, skb, drop_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -1940,7 +1940,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
-   tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(NULL, skb, drop_reason);
}
 
 discard_it:
@@ -1995,7 +1995,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
-   tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+   tcp_v6_send_reset(sk, skb, drop_reason);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
-- 
2.37.3

[PATCH net-next 3/6] rstreason: prepare for active reset

2024-04-03 Thread Jason Xing

From: Jason Xing 

Like what we did to passive reset:
only passing possible reset reason in each active reset path.

No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/tcp.h |  2 +-
 net/ipv4/tcp.c| 15 ++-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/tcp_timer.c  |  9 ++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6ae35199d3b3..2b9b9d3d8065 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -667,7 +667,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e767721b3a58..eacfe0012977 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2805,7 +2806,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+   tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_NOT_SPECIFIED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2879,7 +2881,8 @@ void __tcp_close(struct sock *sk, long timeout)
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2897,7 +2900,8 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -3001,7 +3005,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* The last check adjusts for discrepancy of Linux wrt. RFC
 * states
 */
-   tcp_send_active_reset(sk, gfp_any());
+   tcp_send_active_reset(sk, gfp_any(), 
SK_RST_REASON_NOT_SPECIFIED);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4557,7 +4561,8 @@ int tcp_abort(struct sock *sk, int err)
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
-   tcp_send_active_reset(sk, GFP_ATOMIC);
+   tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk);
}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..18fbbad2028a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3583,7 +3583,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority, int reason)
 {
struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d1ad20ce1c8c..7e7110bf3ea2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)
(!tp->snd_wnd && !tp->packets_out))

[PATCH net-next 2/6] rstreason: prepare for passive reset

2024-04-03 Thread Jason Xing

From: Jason Xing 

Adjust the parameters and support passing the reset reason, which
is NOT_SPECIFIED for now. No functional changes.

Signed-off-by: Jason Xing 
---
 include/net/request_sock.h |  3 ++-
 net/dccp/ipv4.c| 10 ++
 net/dccp/ipv6.c| 10 ++
 net/dccp/minisocks.c   |  3 ++-
 net/ipv4/tcp_ipv4.c| 12 +++-
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c| 15 +--
 net/mptcp/subflow.c|  8 +---
 8 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..93f9fee7e52f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -34,7 +34,8 @@ struct request_sock_ops {
void(*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void(*send_reset)(const struct sock *sk,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ int reason);
void(*destructor)(struct request_sock *req);
void(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 44b033fe1ef6..628dd783e8f3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ static int dccp_v4_send_response(const struct sock *sk, 
struct request_sock *req
return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
int err;
const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
 
 reset:
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
kfree_skb(skb);
return 0;
 }
@@ -869,7 +871,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -909,7 +911,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v4_ctl_send_reset(sk, skb);
+   dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ded07e09f813..d64f39e26e87 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock 
*req)
kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff 
*rxskb,
+  int reason)
 {
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 
 reset:
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
if (opt_skb != NULL)
__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (nsk == sk) {
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, 
SK_RST_REASON_NOT_SPECIFIED);
goto discard_and_relse;
} else {
sock_put(sk);
@@ -801,7 +803,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
-   dccp_v6_ctl_send_reset(sk, skb);
+   dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
}
 
 discard_it:
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 64d805b27add..251a57cf5822 100644
--- a/net/dccp/minisocks.c

[PATCH net-next 1/6] net: introduce rstreason to detect why the RST is sent

2024-04-03 Thread Jason Xing

From: Jason Xing 

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) reuse MP_TCPRST option for MPTCP
3) our own reasons

I will implement the basic code for active/passive reset reasons in
those three protocols, which is not complete at this moment. But
it provides a new chance to let other people add more reasons into
it:)

Signed-off-by: Jason Xing 
---
 include/net/rstreason.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/net/rstreason.h

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index ..24d098a78a60
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include 
+
+#define DEFINE_RST_REASON(FN, FNe) \
+   FN(MPTCP_RST_EUNSPEC)   \
+   FN(MPTCP_RST_EMPTCP)\
+   FN(MPTCP_RST_ERESOURCE) \
+   FN(MPTCP_RST_EPROHIBIT) \
+   FN(MPTCP_RST_EWQ2BIG)   \
+   FN(MPTCP_RST_EBADPERF)  \
+   FN(MPTCP_RST_EMIDDLEBOX)\
+   FN(NOT_SPECIFIED)   \
+   FNe(MAX)
+
+#define RST_REASON_START (SKB_DROP_REASON_MAX + 1)
+
+/* There are three parts in order:
+ * 1) 0 - SKB_DROP_REASON_MAX: rely on drop reasons for passive reset in TCP
+ * 2) SKB_DROP_REASON_MAX + 1 - MPTCP_RST_EMIDDLEBOX: for MPTCP use
+ * 3) MPTCP_RST_EMIDDLEBOX - SK_RST_REASON_MAX: independent reset reason
+ */
+enum sk_rst_reason {
+   /* Leave this 'blank' part (0-SKB_DROP_REASON_MAX) for the reuse
+* of skb drop reason because rst reason relies on what drop reason
+* indicates exactly why it could happen.
+*/
+
+   /* Copy from include/uapi/linux/mptcp.h.
+* These reset fields will not be changed since they adhere to
+* RFC 8684. So do not touch them. I'm going to list each definition
+* of them respectively.
+*/
+   /* Unspecified error.
+* This is the default error; it implies that the subflow is no
+* longer available. The presence of this option shows that the
+* RST was generated by an MPTCP-aware device.
+*/
+   SK_RST_REASON_MPTCP_RST_EUNSPEC = RST_REASON_START,
+   /* MPTCP-specific error.
+* An error has been detected in the processing of MPTCP options.
+* This is the usual reason code to return in the cases where a RST
+* is being sent to close a subflow because of an invalid response.
+*/
+   SK_RST_REASON_MPTCP_RST_EMPTCP,
+   /* Lack of resources.
+* This code indicates that the sending host does not have enough
+* resources to support the terminated subflow.
+*/
+   SK_RST_REASON_MPTCP_RST_ERESOURCE,
+   /* Administratively prohibited.
+* This code indicates that the requested subflow is prohibited by
+* the policies of the sending host.
+*/
+   SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+   /* Too much outstanding data.
+* This code indicates that there is an excessive amount of data
+* that needs to be transmitted over the terminated subflow while
+* having already been acknowledged over one or more other subflows.
+* This may occur if a path has been unavailable for a short period
+* and it is more efficient to reset and start again than it is to
+* retransmit the queued data.
+*/
+   SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+   /* Unacceptable performance.
+* This code indicates that the performance of this subflow was
+* too low compared to the other subflows of this Multipath TCP
+* connection.
+*/
+   SK_RST_REASON_MPTCP_RST_EBADPERF,
+   /* Middlebox interference.
+* Middlebox interference has been detected over this subflow,
+* making MPTCP signaling invalid. For example, this may be sent
+* if the checksum does not validate.
+*/
+   SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+   /* For the real standalone socket reset reason, we start from here */
+   SK_RST_REASON_NOT_SPECIFIED,
+
+   /* Maximum of socket reset reasons.
+* It shouldn't be used as a real 'reason'.
+*/
+   SK_RST_REASON_MAX,
+};
+
+static inline int convert_mptcp_reason(int reason)
+{
+   return reason += RST_REASON_START;
+}
+#endif
-- 
2.37.3

[PATCH net-next 0/6] Implement reset reason mechanism to detect

2024-04-03 Thread Jason Xing

From: Jason Xing 

In production, there are so many cases about why the RST skb is sent but
we don't have a very convenient/fast method to detect the exact underlying
reasons.

RST is implemented in two kinds: passive kind (like tcp_v4_send_reset())
and active kind (like tcp_send_active_reset()). The former can be traced
carefully 1) in TCP, with the help of drop reasons, which is based on
Eric's idea[1], 2) in MPTCP, with the help of reset options defined in
RFC 8684. The latter is relatively independent, which should be
implemented on our own.

In this series, I focus mostly on the fundamental implementation: how
the rstreason mechanism and the detailed passive part work as an
example, not including the active reset part. In the future, we can go
further and refine those NOT_SPECIFIED reasons.

Here are some examples when tracing:
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NOT_SPECIFIED
-0   [002] ..s1.  1830.262425: tcp_send_reset: skbaddr=x
skaddr=x src=x dest=x state=x reason=NO_SOCKET

[1]
Link: 
https://lore.kernel.org/all/CANn89iJw8x-LqgsWOeJQQvgVg6DnL5aBRLi10QN2WBdr+X4k=w...@mail.gmail.com/

Note:
It's based on top of 
https://patchwork.kernel.org/project/netdevbpf/list/?series=840182


Jason Xing (6):
  net: introduce rstreason to detect why the RST is sent
  rstreason: prepare for passive reset
  rstreason: prepare for active reset
  tcp: support rstreason for passive reset
  mptcp: support rstreason for passive reset
  rstreason: make it work in trace world

 include/net/request_sock.h |  3 +-
 include/net/rstreason.h| 93 ++
 include/net/tcp.h  |  2 +-
 include/trace/events/tcp.h | 37 +--
 net/dccp/ipv4.c| 10 ++--
 net/dccp/ipv6.c| 10 ++--
 net/dccp/minisocks.c   |  3 +-
 net/ipv4/tcp.c | 15 --
 net/ipv4/tcp_ipv4.c| 14 +++---
 net/ipv4/tcp_minisocks.c   |  3 +-
 net/ipv4/tcp_output.c  |  4 +-
 net/ipv4/tcp_timer.c   |  9 ++--
 net/ipv6/tcp_ipv6.c| 17 ---
 net/mptcp/protocol.c   |  4 +-
 net/mptcp/subflow.c| 33 ++
 15 files changed, 209 insertions(+), 48 deletions(-)
 create mode 100644 include/net/rstreason.h

-- 
2.37.3

Re: Subject: [PATCH net-next v4] net/ipv4: add tracepoint for icmp_send

2024-04-01 Thread Jason Xing

On Mon, Apr 1, 2024 at 8:34 PM  wrote:
>
> From: hepeilin 
>
> Introduce a tracepoint for icmp_send, which can help users to get more
> detailed information conveniently when abnormal icmp events happen.
>
> 1. Giving an usecase example:
> =
> When an application experiences packet loss due to an unreachable UDP
> destination port, the kernel will send an exception message through the
> icmp_send function. By adding a trace point for icmp_send, developers or
> system administrators can obtain detailed information about the UDP
> packet loss, including the type, code, source address, destination address,
> source port, and destination port. This facilitates the trouble-shooting
> of UDP packet loss issues especially for those network-service
> applications.
>
> 2. Operation Instructions:
> ==
> Switch to the tracing directory.
> cd /sys/kernel/tracing
> Filter for destination port unreachable.
> echo "type==3 && code==3" > events/icmp/icmp_send/filter
> Enable trace event.
> echo 1 > events/icmp/icmp_send/enable
>
> 3. Result View:
> 
>  udp_client_erro-11370   [002] ...s.12   124.728002:
>  icmp_send: icmp_send: type=3, code=3.
>  From 127.0.0.1:41895 to 127.0.0.1: ulen=23
>  skbaddr=589b167a
>
> v3->v4:
> Some fixes according to
> https://lore.kernel.org/all/CANn89i+EFEr7VHXNdOi59Ba_R1nFKSBJzBzkJFVgCTdXBx=y...@mail.gmail.com/
> 1.Add legality check for UDP header in SKB.

I think my understanding of what Eric described differs from yours:
we're supposed to filter out the many invalid cases and only trace
the valid action of sending an ICMP message, so what matters is where
the new tracepoint is placed, rather than adding more checks inside
the tracepoint itself.
Please refer to what trace_tcp_retransmit_skb() does :)

Thanks,
Jason

> 2.Target this patch for net-next.
>
> v2->v3:
> Some fixes according to
> https://lore.kernel.org/all/20240319102549.7f7f6...@gandalf.local.home/
> 1. Change the tracking directory to/sys/kernel/tracking.
> 2. Adjust the layout of the TP-STRUCT_entry parameter structure.
>
> v1->v2:
> Some fixes according to
> https://lore.kernel.org/all/CANn89iL-y9e_VFpdw=sztrnkru_tnuwqhufqtjvjsv-nz1x...@mail.gmail.com/
> 1. adjust the trace_icmp_send() to more protocols than UDP.
> 2. move the calling of trace_icmp_send after sanity checks
> in __icmp_send().
>
> Signed-off-by: Peilin He
> Reviewed-by: xu xin 
> Reviewed-by: Yunkai Zhang 
> Cc: Yang Yang 
> Cc: Liu Chun 
> Cc: Xuexin Jiang 
> ---
>  include/trace/events/icmp.h | 65 +
>  net/ipv4/icmp.c |  4 +++
>  2 files changed, 69 insertions(+)
>  create mode 100644 include/trace/events/icmp.h
>
> diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
> new file mode 100644
> index ..7d5190f48a28
> --- /dev/null
> +++ b/include/trace/events/icmp.h
> @@ -0,0 +1,65 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM icmp
> +
> +#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_ICMP_H
> +
> +#include 
> +#include 
> +
> +TRACE_EVENT(icmp_send,
> +
> +   TP_PROTO(const struct sk_buff *skb, int type, int code),
> +
> +   TP_ARGS(skb, type, code),
> +
> +   TP_STRUCT__entry(
> +   __field(const void *, skbaddr)
> +   __field(int, type)
> +   __field(int, code)
> +   __array(__u8, saddr, 4)
> +   __array(__u8, daddr, 4)
> +   __field(__u16, sport)
> +   __field(__u16, dport)
> +   __field(unsigned short, ulen)
> +   ),
> +
> +   TP_fast_assign(
> +   struct iphdr *iph = ip_hdr(skb);
> +   int proto_4 = iph->protocol;
> +   __be32 *p32;
> +
> +   __entry->skbaddr = skb;
> +   __entry->type = type;
> +   __entry->code = code;
> +
> +   struct udphdr *uh = udp_hdr(skb);
> +   if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head ||
> +   (u8 *)uh + sizeof(struct udphdr) > 
> skb_tail_pointer(skb)) {
> +   __entry->sport = 0;
> +   __entry->dport = 0;
> +   __entry->ulen = 0;
> +   } else {
> +   __entry->sport = ntohs(uh->source);
> +   __entry->dport = ntohs(uh->dest);
> +   __entry->ulen = ntohs(uh->len);
> +   }
> +
> +   p32 = (__be32 *) __entry->saddr;
> +   *p32 = iph->saddr;
> +
> +   p32 = (__be32 *) __entry->daddr;
> +   *p32 =

[PATCH net-next v4 2/2] trace: tcp: fully support trace_tcp_send_reset

2024-04-01 Thread Jason Xing

From: Jason Xing 

Prior to this patch, enabling trace_tcp_send_reset only shows resets
that happen under two circumstances:
1) active rst mode
2) non-active rst mode and based on the full socket

That means the results are inconsistent if we use tcpdump and the trace
simultaneously to see how RSTs happen.

We should also take other cases into consideration, for example:
1) time-wait socket
2) no socket
...

By parsing the incoming skb and reversing its 4-tuple, we can
know the exact 'flow', which might not exist.

Sample output after applying this patch:
1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
state=TCP_ESTABLISHED
2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
state=UNKNOWN
Note:
1) UNKNOWN means we cannot extract the right information from skb.
2) skbaddr/skaddr could be 0

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 40 --
 net/ipv4/tcp_ipv4.c|  7 +++
 net/ipv6/tcp_ipv6.c|  3 ++-
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index cf14b6fcbeed..5c04a61a11c2 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -78,11 +78,47 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
-DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
+TRACE_EVENT(tcp_send_reset,
 
TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
 
-   TP_ARGS(sk, skb)
+   TP_ARGS(sk, skb),
+
+   TP_STRUCT__entry(
+   __field(const void *, skbaddr)
+   __field(const void *, skaddr)
+   __field(int, state)
+   __array(__u8, saddr, sizeof(struct sockaddr_in6))
+   __array(__u8, daddr, sizeof(struct sockaddr_in6))
+   ),
+
+   TP_fast_assign(
+   __entry->skbaddr = skb;
+   __entry->skaddr = sk;
+   /* Zero means unknown state. */
+   __entry->state = sk ? sk->sk_state : 0;
+
+   memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+   memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+   if (sk && sk_fullsock(sk)) {
+   const struct inet_sock *inet = inet_sk(sk);
+
+   TP_STORE_ADDR_PORTS(__entry, inet, sk);
+   } else if (skb) {
+   const struct tcphdr *th = (const struct tcphdr 
*)skb->data;
+   /*
+* We should reverse the 4-tuple of skb, so later
+* it can print the right flow direction of rst.
+*/
+   TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, 
entry->saddr);
+   }
+   ),
+
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+ __entry->skbaddr, __entry->skaddr,
+ __entry->saddr, __entry->daddr,
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
 );
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee5838751..0d47b48f8cfd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -866,11 +866,10 @@ static void tcp_v4_send_reset(const struct sock *sk, 
struct sk_buff *skb)
 * routing might fail in this case. No choice here, if we choose to 
force
 * input interface, we will misroute in case of asymmetric route.
 */
-   if (sk) {
+   if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
-   if (sk_fullsock(sk))
-   trace_tcp_send_reset(sk, skb);
-   }
+
+   trace_tcp_send_reset(sk, skb);
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3f4cba49e9ee..8e9c59b6c00c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb)
if (sk) {
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk)) {
-   trace_tcp_send_reset(sk, skb);
if (inet6_test_bit(REPFLOW, sk))
label = ip6_flowlabel(ipv6h);
priority = READ_ONCE(sk->sk_priority);
@@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
}
 
+   trace_tcp_send_reset(sk, skb);
+
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
 );
-- 
2.37.3

[PATCH net-next v4 1/2] trace: adjust TP_STORE_ADDR_PORTS_SKB() parameters

2024-04-01 Thread Jason Xing

From: Jason Xing 

Introducing entry_saddr and entry_daddr parameters in this macro
for later use can help us record the reverse 4-tuple by analyzing
the 4-tuple of the incoming skb when receiving.

Signed-off-by: Jason Xing 
Reviewed-by: Eric Dumazet 
---
 include/trace/events/net_probe_common.h | 20 +++-
 include/trace/events/tcp.h  |  2 +-
 include/trace/events/udp.h  |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/net_probe_common.h 
b/include/trace/events/net_probe_common.h
index 5e33f91bdea3..976a58364bff 100644
--- a/include/trace/events/net_probe_common.h
+++ b/include/trace/events/net_probe_common.h
@@ -70,14 +70,14 @@
TP_STORE_V4MAPPED(__entry, saddr, daddr)
 #endif
 
-#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)   \
+#define TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh, entry_saddr, entry_daddr) \
do {\
-   struct sockaddr_in *v4 = (void *)__entry->saddr;\
+   struct sockaddr_in *v4 = (void *)entry_saddr;   \
\
v4->sin_family = AF_INET;   \
v4->sin_port = protoh->source;  \
v4->sin_addr.s_addr = ip_hdr(skb)->saddr;   \
-   v4 = (void *)__entry->daddr;\
+   v4 = (void *)entry_daddr;   \
v4->sin_family = AF_INET;   \
v4->sin_port = protoh->dest;\
v4->sin_addr.s_addr = ip_hdr(skb)->daddr;   \
@@ -85,28 +85,30 @@
 
 #if IS_ENABLED(CONFIG_IPV6)
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)  \
+#define TP_STORE_ADDR_PORTS_SKB(skb, protoh, entry_saddr, entry_daddr) \
do {\
const struct iphdr *iph = ip_hdr(skb);  \
\
if (iph->version == 6) {\
-   struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+   struct sockaddr_in6 *v6 = (void *)entry_saddr;  \
\
v6->sin6_family = AF_INET6; \
v6->sin6_port = protoh->source; \
v6->sin6_addr = ipv6_hdr(skb)->saddr;   \
-   v6 = (void *)__entry->daddr;\
+   v6 = (void *)entry_daddr;   \
v6->sin6_family = AF_INET6; \
v6->sin6_port = protoh->dest;   \
v6->sin6_addr = ipv6_hdr(skb)->daddr;   \
} else  \
-   TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh); \
+   TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh, \
+  entry_saddr, \
+  entry_daddr);\
} while (0)
 
 #else
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)  \
-   TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)
+#define TP_STORE_ADDR_PORTS_SKB(skb, protoh, entry_saddr, entry_daddr) \
+   TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh, entry_saddr, entry_daddr)
 
 #endif
 
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 1db95175c1e5..cf14b6fcbeed 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(tcp_event_skb,
memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-   TP_STORE_ADDR_PORTS_SKB(__entry, skb, th);
+   TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, 
__entry->daddr);
),
 
TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h
index 62bebe2a6ece..6142be4068e2 100644
--- a/include/trace/events/udp.h
+++ b/include/trace/events/udp.h
@@ -38,7 +38,7 @@ TRACE_EVENT(udp_fail_queue_rcv_skb,
memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-   TP_STORE_ADDR_PORTS_SKB(__entry, skb, uh);
+   TP_STORE_ADDR_PORTS_SKB(skb, uh, _

[PATCH net-next v4 0/2] tcp: make trace of reset logic complete

2024-04-01 Thread Jason Xing

From: Jason Xing 

Before this, we miss some cases where the TCP layer could send RST but
we cannot trace it. So I decided to complete it :)

v4
Link: 
https://lore.kernel.org/all/20240329034243.7929-1-kerneljasonx...@gmail.com/
1. rebased against latest net-next
2. remove {} and add skb test statement (Eric)
3. drop v3 patch [3/3] temporarily because 1) the location is not that useful
since we can use perf or something else to trace, 2) Eric said we could
use drop_reason to show why we have to send the RST, which is good, but this
does not seem to work well for the ->send_reset() logic. I need more time to
investigate this part.

v3
1. fix a format problem in patch [3/3]

v2
1. fix spelling mistakes

Jason Xing (2):
  trace: adjust TP_STORE_ADDR_PORTS_SKB() parameters
  trace: tcp: fully support trace_tcp_send_reset

 include/trace/events/net_probe_common.h | 20 ++--
 include/trace/events/tcp.h  | 42 +++--
 include/trace/events/udp.h  |  2 +-
 net/ipv4/tcp_ipv4.c |  7 ++---
 net/ipv6/tcp_ipv6.c |  3 +-
 5 files changed, 56 insertions(+), 18 deletions(-)

-- 
2.37.3

Re: [PATCH net-next v3 3/3] tcp: add location into reset trace process

2024-03-29 Thread Jason Xing

On Fri, Mar 29, 2024 at 5:13 PM Eric Dumazet  wrote:
>
> On Fri, Mar 29, 2024 at 4:43 AM Jason Xing  wrote:
> >
> > From: Jason Xing 
> >
> > In addition to knowing the 4-tuple of the flow which generates RST,
> > the reason why it does so is very important because we have some
> > cases where the RST should be sent and have no clue which one
> > exactly.
> >
> > Adding location of reset process can help us more, like what
> > trace_kfree_skb does.
>
> Well, I would prefer a drop_reason here, even if there is no 'dropped' packet.

Good idea really. Then we can accurately diagnose which kind of reason
exactly causes the RST behavior.

I'm not sure if we can reuse the drop_reason here, like adding/using
some reasons in enum skb_drop_reason {}? The name is a little bit
strange.

Oh, I can just print the string of reason directly instead of really
using enum skb_drop_reason {}...

>
> This would be more stable than something based on function names that
> could be changed.
>
> tracepoints do not have to get ugly, we can easily get stack traces if needed.
>
> perf record -a -g  -e tcp:tcp_send_reset ...

Ah, yes, I blindly mimicked what trace_kfree_skb() and
trace_consume_skb() do. Introducing some RST reasons is more
reasonable and makes the cause easier to detect, since it's not hard
to add just four or five reasons.

Thanks,
Jason

Re: [PATCH net-next v3 2/3] trace: tcp: fully support trace_tcp_send_reset

2024-03-29 Thread Jason Xing

On Fri, Mar 29, 2024 at 5:07 PM Eric Dumazet  wrote:
>
> On Fri, Mar 29, 2024 at 4:43 AM Jason Xing  wrote:
> >
> > From: Jason Xing 
> >
> > Prior to this patch, what we can see by enabling trace_tcp_send is
> > only happening under two circumstances:
> > 1) active rst mode
> > 2) non-active rst mode and based on the full socket
> >
> > That means the inconsistency occurs if we use tcpdump and trace
> > simultaneously to see how rst happens.
> >
> > It's necessary that we should take into other cases into considerations,
> > say:
> > 1) time-wait socket
> > 2) no socket
> > ...
> >
> > By parsing the incoming skb and reversing its 4-tuple can
> > we know the exact 'flow' which might not exist.
> >
> > Samples after applied this patch:
> > 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
> > state=TCP_ESTABLISHED
> > 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
> > state=UNKNOWN
> > Note:
> > 1) UNKNOWN means we cannot extract the right information from skb.
> > 2) skbaddr/skaddr could be 0
> >
> > Signed-off-by: Jason Xing 
> > ---
> >  include/trace/events/tcp.h | 39 --
> >  net/ipv4/tcp_ipv4.c|  4 ++--
> >  net/ipv6/tcp_ipv6.c|  3 ++-
> >  3 files changed, 41 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> > index 194425f69642..289438c54227 100644
> > --- a/include/trace/events/tcp.h
> > +++ b/include/trace/events/tcp.h
> > @@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
> >   * skb of trace_tcp_send_reset is the skb that caused RST. In case of
> >   * active reset, skb should be NULL
> >   */
> > -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
> > +TRACE_EVENT(tcp_send_reset,
> >
> > TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> >
> > -   TP_ARGS(sk, skb)
> > +   TP_ARGS(sk, skb),
> > +
> > +   TP_STRUCT__entry(
> > +   __field(const void *, skbaddr)
> > +   __field(const void *, skaddr)
> > +   __field(int, state)
> > +   __array(__u8, saddr, sizeof(struct sockaddr_in6))
> > +   __array(__u8, daddr, sizeof(struct sockaddr_in6))
> > +   ),
> > +
> > +   TP_fast_assign(
> > +   __entry->skbaddr = skb;
> > +   __entry->skaddr = sk;
> > +   /* Zero means unknown state. */
> > +   __entry->state = sk ? sk->sk_state : 0;
> > +
> > +   memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
> > +   memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
> > +
> > +   if (sk && sk_fullsock(sk)) {
> > +   const struct inet_sock *inet = inet_sk(sk);
> > +
> > +   TP_STORE_ADDR_PORTS(__entry, inet, sk);
> > +   } else {
>
> To be on the safe side, I would test if (skb) here.
> We have one caller with skb == NULL, we might have more in the future.

Thanks for the review.

How about changing '} else {' to '} else if (skb) {', then if we go
into this else-if branch, we will print nothing, right? I'll test it
in this case.

>
> > +   /*
> > +* We should reverse the 4-tuple of skb, so later
> > +* it can print the right flow direction of rst.
> > +*/
> > +   TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, 
> > entry->saddr);
> > +   }
> > +   ),
> > +
> > +   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
> > + __entry->skbaddr, __entry->skaddr,
> > + __entry->saddr, __entry->daddr,
> > + __entry->state ? show_tcp_state_name(__entry->state) : 
> > "UNKNOWN")
> >  );
> >
> >  /*
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index a22ee5838751..d5c4a969c066 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, 
> > struct sk_buff *skb)
> >  */
> > if (sk) {
> > arg.bound_dev_if = sk->sk_bound_dev_if;
> > -   if (sk_fullsock(sk))
> > -

[PATCH net-next v3 3/3] tcp: add location into reset trace process

2024-03-28 Thread Jason Xing

From: Jason Xing 

In addition to knowing the 4-tuple of the flow which generates the RST,
the reason why it does so is very important, because there are several
cases where an RST is sent and we have no clue which one exactly
applied.

Adding the location of the reset process can help us more, like what
trace_kfree_skb does.

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 13 +
 net/ipv4/tcp_ipv4.c|  2 +-
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 289438c54227..7a6dc525bfc7 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -80,13 +80,16 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
  */
 TRACE_EVENT(tcp_send_reset,
 
-   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+   TP_PROTO(const struct sock *sk,
+const struct sk_buff *skb,
+void *location),
 
-   TP_ARGS(sk, skb),
+   TP_ARGS(sk, skb, location),
 
TP_STRUCT__entry(
__field(const void *, skbaddr)
__field(const void *, skaddr)
+   __field(void *, location)
__field(int, state)
__array(__u8, saddr, sizeof(struct sockaddr_in6))
__array(__u8, daddr, sizeof(struct sockaddr_in6))
@@ -112,12 +115,14 @@ TRACE_EVENT(tcp_send_reset,
 */
TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, 
entry->saddr);
}
+   __entry->location = location;
),
 
-   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s 
location=%pS",
  __entry->skbaddr, __entry->skaddr,
  __entry->saddr, __entry->daddr,
- __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN",
+ __entry->location)
 );
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d5c4a969c066..fec54cfc4fb3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -870,7 +870,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
arg.bound_dev_if = sk->sk_bound_dev_if;
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb,  __builtin_return_address(0));
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..fb613582817e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3608,7 +3608,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t 
priority)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 * skb here is different to the troublesome skb, so use NULL
 */
-   trace_tcp_send_reset(sk, NULL);
+   trace_tcp_send_reset(sk, NULL,  __builtin_return_address(0));
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8e9c59b6c00c..7eba9c3d69f1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1128,7 +1128,7 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
}
 
-   trace_tcp_send_reset(sk, skb);
+   trace_tcp_send_reset(sk, skb,  __builtin_return_address(0));
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
2.37.3

[PATCH net-next v3 2/3] trace: tcp: fully support trace_tcp_send_reset

2024-03-28 Thread Jason Xing

From: Jason Xing 

Prior to this patch, enabling trace_tcp_send_reset only shows resets
that happen under two circumstances:
1) active rst mode
2) non-active rst mode and based on the full socket

That means the results are inconsistent if we use tcpdump and the trace
simultaneously to see how RSTs happen.

We should also take other cases into consideration, for example:
1) time-wait socket
2) no socket
...

By parsing the incoming skb and reversing its 4-tuple, we can
know the exact 'flow', which might not exist.

Sample output after applying this patch:
1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
state=TCP_ESTABLISHED
2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
state=UNKNOWN
Note:
1) UNKNOWN means we cannot extract the right information from skb.
2) skbaddr/skaddr could be 0

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 39 --
 net/ipv4/tcp_ipv4.c|  4 ++--
 net/ipv6/tcp_ipv6.c|  3 ++-
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 194425f69642..289438c54227 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
-DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
+TRACE_EVENT(tcp_send_reset,
 
TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
 
-   TP_ARGS(sk, skb)
+   TP_ARGS(sk, skb),
+
+   TP_STRUCT__entry(
+   __field(const void *, skbaddr)
+   __field(const void *, skaddr)
+   __field(int, state)
+   __array(__u8, saddr, sizeof(struct sockaddr_in6))
+   __array(__u8, daddr, sizeof(struct sockaddr_in6))
+   ),
+
+   TP_fast_assign(
+   __entry->skbaddr = skb;
+   __entry->skaddr = sk;
+   /* Zero means unknown state. */
+   __entry->state = sk ? sk->sk_state : 0;
+
+   memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+   memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+   if (sk && sk_fullsock(sk)) {
+   const struct inet_sock *inet = inet_sk(sk);
+
+   TP_STORE_ADDR_PORTS(__entry, inet, sk);
+   } else {
+   /*
+* We should reverse the 4-tuple of skb, so later
+* it can print the right flow direction of rst.
+*/
+   TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, 
entry->saddr);
+   }
+   ),
+
+   TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+ __entry->skbaddr, __entry->skaddr,
+ __entry->saddr, __entry->daddr,
+ __entry->state ? show_tcp_state_name(__entry->state) : 
"UNKNOWN")
 );
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee5838751..d5c4a969c066 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, 
struct sk_buff *skb)
 */
if (sk) {
arg.bound_dev_if = sk->sk_bound_dev_if;
-   if (sk_fullsock(sk))
-   trace_tcp_send_reset(sk, skb);
}
 
+   trace_tcp_send_reset(sk, skb);
+
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3f4cba49e9ee..8e9c59b6c00c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb)
if (sk) {
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk)) {
-   trace_tcp_send_reset(sk, skb);
if (inet6_test_bit(REPFLOW, sk))
label = ip6_flowlabel(ipv6h);
priority = READ_ONCE(sk->sk_priority);
@@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, 
struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
}
 
+   trace_tcp_send_reset(sk, skb);
+
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 ipv6_get_dsfield(ipv6h), label, priority, txhash,
 );
-- 
2.37.3

[PATCH net-next v3 1/3] trace: adjust TP_STORE_ADDR_PORTS_SKB() parameters

2024-03-28 Thread Jason Xing

From: Jason Xing 

Introducing entry_saddr and entry_daddr parameters in this macro
for later use can help us record the reverse 4-tuple by analyzing
the 4-tuple of the incoming skb when receiving.

Signed-off-by: Jason Xing 
---
 include/trace/events/tcp.h | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 3c08a0846c47..194425f69642 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -273,15 +273,15 @@ TRACE_EVENT(tcp_probe,
  __entry->skbaddr, __entry->skaddr)
 );
 
-#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb)   \
+#define TP_STORE_ADDR_PORTS_SKB_V4(skb, entry_saddr, entry_daddr)  \
do {\
const struct tcphdr *th = (const struct tcphdr *)skb->data; \
-   struct sockaddr_in *v4 = (void *)__entry->saddr;\
+   struct sockaddr_in *v4 = (void *)entry_saddr;   \
\
v4->sin_family = AF_INET;   \
v4->sin_port = th->source;  \
v4->sin_addr.s_addr = ip_hdr(skb)->saddr;   \
-   v4 = (void *)__entry->daddr;\
+   v4 = (void *)entry_daddr;   \
v4->sin_family = AF_INET;   \
v4->sin_port = th->dest;\
v4->sin_addr.s_addr = ip_hdr(skb)->daddr;   \
@@ -289,29 +289,30 @@ TRACE_EVENT(tcp_probe,
 
 #if IS_ENABLED(CONFIG_IPV6)
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb)  \
+#define TP_STORE_ADDR_PORTS_SKB(skb, entry_saddr, entry_daddr) \
do {\
const struct iphdr *iph = ip_hdr(skb);  \
\
if (iph->version == 6) {\
const struct tcphdr *th = (const struct tcphdr 
*)skb->data; \
-   struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+   struct sockaddr_in6 *v6 = (void *)entry_saddr;  \
\
v6->sin6_family = AF_INET6; \
v6->sin6_port = th->source; \
v6->sin6_addr = ipv6_hdr(skb)->saddr;   \
-   v6 = (void *)__entry->daddr;\
+   v6 = (void *)entry_daddr;   \
v6->sin6_family = AF_INET6; \
v6->sin6_port = th->dest;   \
v6->sin6_addr = ipv6_hdr(skb)->daddr;   \
} else  \
-   TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb);   \
+   TP_STORE_ADDR_PORTS_SKB_V4(skb, entry_saddr,\
+  entry_daddr); \
} while (0)
 
 #else
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb)  \
-   TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb)
+#define TP_STORE_ADDR_PORTS_SKB(skb, entry_saddr, entry_daddr) \
+   TP_STORE_ADDR_PORTS_SKB_V4(skb, entry_saddr, entry_daddr)
 
 #endif
 
@@ -336,7 +337,7 @@ DECLARE_EVENT_CLASS(tcp_event_skb,
memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-   TP_STORE_ADDR_PORTS_SKB(__entry, skb);
+   TP_STORE_ADDR_PORTS_SKB(skb, __entry->saddr, __entry->daddr);
),
 
TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
-- 
2.37.3

[PATCH net-next v3 0/3] tcp: make trace of reset logic complete

2024-03-28 Thread Jason Xing

From: Jason Xing 

Before this, we miss some cases where the TCP layer could send RST but
we cannot trace it. So I decided to complete it :)

v3
1. fix a format problem in patch [3/3]

v2
1. fix spelling mistakes

Jason Xing (3):
  trace: adjust TP_STORE_ADDR_PORTS_SKB() parameters
  trace: tcp: fully support trace_tcp_send_reset
  tcp: add location into reset trace process

 include/trace/events/tcp.h | 67 ++
 net/ipv4/tcp_ipv4.c|  4 +--
 net/ipv4/tcp_output.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  3 +-
 4 files changed, 59 insertions(+), 17 deletions(-)

-- 
2.37.3

Re: [PATCH net-next v2 3/3] tcp: add location into reset trace process

2024-03-28 Thread Jason Xing

On Fri, Mar 29, 2024 at 9:15 AM Jakub Kicinski  wrote:
>
> On Tue, 26 Mar 2024 12:08:01 +0100 Paolo Abeni wrote:
> > > -   TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> > > +   TP_PROTO(
> > > +   const struct sock *sk,
> > > +   const struct sk_buff *skb,
> > > +   void *location),
> >
> > Very minor nit: the above lines should be aligned with the open
> > bracket.
>
> Yes, and a very odd way of breaking it up. Empty line after ( but
> ) not on a separate line.

After checking the history with git blame, maybe I should follow the
format used by TRACE_EVENT(netfs_read)?

>
> > No need to repost just for this, but let's wait for Eric's feedback.
>
> Erring on the side of caution I'd read this:
> https://lore.kernel.org/all/CANn89iKK-qPhQ91Sq8rR_=KDWajnY2=et2bujdsgoqk4wxf...@mail.gmail.com/
> as lukewarm towards tp changes. Please repost if you think otherwise
> (with the formatting fixed)

Yes, I will repost it. I'm not introducing a controversial new tracepoint.

This patch is not only about whether we should use 'old-way' tracing,
but about the tcp reset tracepoint being incomplete. Some admins may
use bpf to capture RST behaviours by hooking this tracepoint, which
apparently is not right at the moment.

Besides, I ran a simple test comparing the performance of tracing and
bpf when monitoring the fast path (like __tcp_transmit_skb()) on my
loopback. I saw at least 12% degradation with BPF used. So the advantage
of trace is obvious even though nowadays it is considered an old-school
method.

Thanks,
Jason

> --
> pw-bot: cr

Re: [PATCH net-next v4 2/2] net: udp: add IP/port data to the tracepoint udp/udp_fail_queue_rcv_skb

2024-03-27 Thread Jason Xing

On Wed, Mar 27, 2024 at 2:05 AM Balazs Scheidler  wrote:
>
> The udp_fail_queue_rcv_skb() tracepoint lacks any details on the source
> and destination IP/port whereas this information can be critical in case
> of UDP/syslog.
>
> Signed-off-by: Balazs Scheidler 

Looks good to me, thanks!

Reviewed-by: Jason Xing

Re: [PATCH net-next 0/3] trace: use TP_STORE_ADDRS macro

2024-03-26 Thread Jason Xing

On Tue, Mar 26, 2024 at 9:18 PM Eric Dumazet  wrote:
>
> On Tue, Mar 26, 2024 at 11:44 AM Jason Xing  wrote:
>
> > Well, it's a pity that it seems that we are about to abandon this
> > method but it's not that friendly to the users who are unable to
> > deploy BPF...
>
> It is a pity these tracepoint patches are consuming a lot of reviewer
> time, just because
> some people 'can not deploy BPF'

Sure, not everyone can do this easily. The phenomenon still exists and
we cannot ignore it. Do you remember that about a month ago someone
submitted one patch introducing a new tracepoint and then I replied
to/asked you if it's necessary that we replace most of the tracepoints
with BPF? Now I realise and accept the fact...

I'll keep reviewing such patches and hope it can give you maintainers
a break. I don't mind taking some time to do it, after all it's not a
bad thing to help some people.

>
> > Well, I came up with more ideas about how to improve the
> > trace function in recent days. The motivation of doing this is that I
> > encountered some issues which could be traced/diagnosed by using trace
> > effortlessly without writing some bpftrace codes again and again. The
> > status of trace seems not active but many people are still using it, I
> > believe.
>
> 'Writing bpftrace codes again and again' is not a good reason to add
> maintenance costs
> to linux networking stack.

I'm just saying :)

Re: [PATCH net-next 0/3] trace: use TP_STORE_ADDRS macro

2024-03-26 Thread Jason Xing

On Tue, Mar 26, 2024 at 6:29 PM Paolo Abeni  wrote:
>
> On Tue, 2024-03-26 at 12:14 +0800, Jason Xing wrote:
> > On Mon, Mar 25, 2024 at 11:43 AM Jason Xing  
> > wrote:
> > >
> > > From: Jason Xing 
> > >
> > > > Use the macro in other tracepoints to be more concise.
> > > No functional change.
> > >
> > > Jason Xing (3):
> > >   trace: move to TP_STORE_ADDRS related macro to net_probe_common.h
> > >   trace: use TP_STORE_ADDRS() macro in inet_sk_error_report()
> > >   trace: use TP_STORE_ADDRS() macro in inet_sock_set_state()
> > >
> > >  include/trace/events/net_probe_common.h | 29 
> > >  include/trace/events/sock.h | 35 -
> >
> > I just noticed that some trace files in include/trace directory (like
> > net_probe_common.h, sock.h, skb.h, net.h, sock.h, udp.h, sctp.h,
> > qdisc.h, neigh.h, napi.h, icmp.h, ...) are not owned by networking
> > folks while some files (like tcp.h) have been maintained by specific
> > maintainers/experts (like Eric) because they belong to one specific
> > area. I wonder if we can get more networking guys involved in net
> > tracing.
> >
> > I'm not sure if 1) we can put those files into the "NETWORKING
> > [GENERAL]" category, or 2) we can create a new category to include
> > them all.
>
> I think all the files you mentioned are not under networking because of
> MAINTAINER file inaccuracy, and we could move them there accordingly.

Yes, they are not under the networking category currently. So how
could we move them? The MAINTAINER file doesn't have all the specific
categories which are suitable for each of the trace files.

> >
> > I know people start using BPF to trace them all instead, but I can see
> > some good advantages of those hooks implemented in the kernel, say:
> > 1) help those machines which are not easy to use BPF tools.
> > 2) insert the tracepoint in the middle of some functions which cannot
> > be replaced by bpf kprobe.
> > 3) if we have enough tracepoints, we can generate a timeline to
> > know/detect which flow/skb spends unexpected time at which point.
> > ...
> > We can do many things in this area, I think :)
> >
> > What do you think about this, Jakub, Paolo, Eric ?
>
> I agree tracepoints are useful, but I think the general agreement is
> that they are the 'old way', we should try to avoid their
> proliferation.

Well, it's a pity that it seems that we are about to abandon this
method but it's not that friendly to the users who are unable to
deploy BPF... Well, I came up with more ideas about how to improve the
trace function in recent days. The motivation of doing this is that I
encountered some issues which could be traced/diagnosed by using trace
effortlessly without writing some bpftrace codes again and again. The
status of trace seems not active but many people are still using it, I
believe.

Thanks,
Jason

>
> Cheers,
>
> Paolo
>

Re: [PATCH net-next v3 1/2] net: port TP_STORE_ADDR_PORTS_SKB macro to be tcp/udp independent

2024-03-26 Thread Jason Xing

On Mon, Mar 25, 2024 at 6:29 PM Balazs Scheidler  wrote:
>
> This patch moves TP_STORE_ADDR_PORTS_SKB() to a common header and removes
> the TCP specific implementation details.
>
> Previously the macro assumed the skb passed as an argument is a
> TCP packet, the implementation now uses an argument to the L4 header and
> uses that to extract the source/destination ports, which happen
> to be named the same in "struct tcphdr" and "struct udphdr"
>
> Signed-off-by: Balazs Scheidler 

The patch itself looks good to me, feel free to add:
Reviewed-by: Jason Xing

1 2 >

1 - 100 of 158 matches

Mail list logo