On Tue, Jun 16, 2026 at 4:53 PM <[email protected]> wrote:
>
> From: Numan Siddique <[email protected]>
>
> Implement the controller-side handler for the new OVN action
> 'icmp4.inner_ip4.src = <value>' introduced in the previous patch.
>
> When ovn-controller receives a packet with action opcode
> ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC, it now:
>
> - Locates the inner IPv4 header inside the ICMP error payload
> (right after the 8-byte outer ICMP header), validating on
> lengths (dp_packet_l4_size()) so that no out-of-bounds pointer
> is ever synthesized.
> - Rewrites the inner IPv4 source address to the value supplied
> in the userdata.
> - Fixes the inner IPv4 header checksum incrementally.
>
> No outer ICMP checksum update is performed. The ICMP checksum
> covers the inner IP source (changed by +D) and the inner IP
> checksum field (changed by -D by the incremental update above),
> and nothing else here, so the two deltas cancel. This holds for
> any inner header, valid or not, precisely because the inner
> checksum update is incremental.
>
> The inner L4 (TCP/UDP) checksum is intentionally left unchanged.
> Rewriting the inner source invalidates it via the L4
> pseudo-header, but stateless NAT does not remap ports and the
> inner L4 checksum of an ICMP-quoted packet is not validated for
> error correlation: conntrack matches the error to the tracked
> flow using the inner 4-tuple, not the inner L4 checksum. (Should
> a future change ever fix the inner L4 checksum, it would then need
> a compensating outer ICMP checksum update, as that field has no
> offsetting change inside the ICMP payload.)
>
> The outer IPv4 header is not touched, so its checksum stays valid.
> Finally the OpenFlow pipeline is resumed with the rewritten packet.
>
> This is the building block needed to make ICMP fragmentation-
> needed errors (RFC 1191 PMTU discovery) work correctly when an
> OVN logical router uses stateless NAT. With stateless NAT, the
> inner payload of an incoming ICMP error still carries the
> external (post-NAT) IPv4 source, so conntrack in the VM's
> logical switch zone fails to match the tracked outgoing flow
> and marks the packet as +inv. By rewriting the inner source
> back to the logical IP in the LR pipeline before the LS
> pipeline runs, conntrack can correlate the error with the
> original flow and the VM can honor the PMTUD signal.
>
> A follow-up patch will add the corresponding ovn-northd logical
> flow that emits this action for each stateless NAT configured
> on a logical router.
>
> Assisted-by: Claude Opus 4.7, Claude Code
> Signed-off-by: Numan Siddique <[email protected]>
Recheck-request: github-robot-_Build_and_Test
> ---
> controller/pinctrl.c | 104 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 104 insertions(+)
>
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index 12eb5d65f3..4dc4f38ac9 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -263,6 +263,10 @@ static void pinctrl_handle_nd_ns(struct rconn *swconn,
> const struct ofputil_packet_in *pin,
> struct ofpbuf *userdata,
> const struct ofpbuf *continuation);
> +static void pinctrl_handle_put_icmp4_inner_ip4_src(
> + struct rconn *swconn, const struct flow *in_flow,
> + struct dp_packet *pkt_in, struct ofputil_packet_in *pin,
> + struct ofpbuf *userdata, struct ofpbuf *continuation);
> static void
> pinctrl_handle_event(struct ofpbuf *userdata)
> OVS_REQUIRES(pinctrl_mutex);
> @@ -3928,6 +3932,10 @@ process_packet_in(struct rconn *swconn, const struct
> ofp_header *msg)
> break;
> }
>
> + case ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC:
> + pinctrl_handle_put_icmp4_inner_ip4_src(swconn, &headers, &packet,
> &pin,
> + &userdata, &continuation);
> + break;
> default:
> VLOG_WARN_RL(&rl, "unrecognized packet-in opcode %"PRIu32,
> ntohl(ah->opcode));
> @@ -6673,6 +6681,102 @@ exit:
> dp_packet_uninit(pkt_out_ptr);
> }
>
> +/* Called with in the pinctrl_handler thread context. */
> +static void
> +pinctrl_handle_put_icmp4_inner_ip4_src(struct rconn *swconn,
> + const struct flow *in_flow,
> + struct dp_packet *pkt_in,
> + struct ofputil_packet_in *pin,
> + struct ofpbuf *userdata,
> + struct ofpbuf *continuation)
> +{
> + enum ofp_version version = rconn_get_version(swconn);
> + enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
> + struct dp_packet *pkt_out = NULL;
> +
> + /* This action only works for ICMPv4 packets. */
> + if (!is_icmpv4(in_flow, NULL)) {
> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> + VLOG_WARN_RL(&rl,
> + "put_icmp4_inner_ip4_src action on non-ICMPv4 packet");
> + goto exit;
> + }
> +
> + /* A later IPv4 fragment carries no L4 header: the bytes at the L4 offset
> + * are payload continuation, not an ICMP header, so there is nothing to
> + * rewrite here. The northd match on icmp4.type already excludes later
> + * fragments; this keeps the controller path robust regardless. */
> + if (in_flow->nw_frag & FLOW_NW_FRAG_LATER) {
> + goto exit;
> + }
> +
> + ovs_be32 *ip4_src = ofpbuf_try_pull(userdata, sizeof *ip4_src);
> + if (!ip4_src) {
> + goto exit;
> + }
> +
> + /* dp_packet_clone() already copies l2_pad_size, l2_5_ofs, l3_ofs,
> + * l4_ofs, packet_type and md. */
> + pkt_out = dp_packet_clone(pkt_in);
> +
> + struct icmp_header *ih = dp_packet_l4(pkt_out);
> + size_t l4_size = dp_packet_l4_size(pkt_out);
> +
> + /* Need the 8-byte ICMP header plus at least a minimal inner IPv4
> + * header. Check on lengths so we never synthesize an out-of-bounds
> + * pointer. */
> + if (!ih || l4_size < sizeof *ih + sizeof(struct ip_header)) {
> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> + VLOG_WARN_RL(&rl, "put_icmp4_inner_ip4_src: ICMP payload too short "
> + "to contain an inner IPv4 header");
> + goto exit;
> + }
> +
> + /* Known-good: at least sizeof(struct ip_header) bytes are available
> + * from inner_ip to the tail (guaranteed by the check above). */
> + struct ip_header *inner_ip = ALIGNED_CAST(struct ip_header *,
> + (char *) ih + sizeof *ih);
> + size_t inner_avail = l4_size - sizeof *ih; /* no underflow: checked */
> + size_t inner_ihl_bytes = IP_IHL(inner_ip->ip_ihl_ver) * 4;
> +
> + if (inner_ihl_bytes < sizeof *inner_ip || inner_ihl_bytes > inner_avail)
> {
> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> + VLOG_WARN_RL(&rl, "put_icmp4_inner_ip4_src: inner IPv4 header "
> + "length (%"PRIuSIZE") invalid", inner_ihl_bytes);
> + goto exit;
> + }
> +
> + ovs_be32 old_inner_ip_src = get_16aligned_be32(&inner_ip->ip_src);
> + put_16aligned_be32(&inner_ip->ip_src, *ip4_src);
> +
> + /* Fix the inner IPv4 header checksum incrementally (not a full
> + * recompute): a header that arrived with a bad checksum must stay bad
> + * -- we must not launder it. */
> + inner_ip->ip_csum = recalc_csum32(inner_ip->ip_csum,
> + old_inner_ip_src, *ip4_src);
> +
> + /* No outer ICMP checksum update is needed: it covers the inner IP
> + * source (changed by +D) and the inner IP checksum field (changed by
> + * -D by the incremental update above) and nothing else here, so the
> + * two deltas cancel -- for any inner header, valid or not.
> + *
> + * The inner L4 (TCP/UDP) checksum is now stale w.r.t. its
> + * pseudo-header and is deliberately left unchanged: stateless NAT does
> + * not remap ports, and the inner L4 checksum of an ICMP-quoted packet
> + * is not validated for error correlation (conntrack matches the inner
> + * 4-tuple). The outer IPv4 header is unchanged, so its checksum stays
> + * valid too. */
> +
> + pin->packet = dp_packet_data(pkt_out);
> + pin->packet_len = dp_packet_size(pkt_out);
> +
> +exit:
> + queue_msg(swconn, ofputil_encode_resume(pin, continuation, proto));
> + if (pkt_out) {
> + dp_packet_delete(pkt_out);
> + }
> +}
> +
> static void
> wait_controller_event(void)
> {
> --
> 2.54.0
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev