On Tue, Jun 16, 2026 at 4:53 PM <[email protected]> wrote:
>
> From: Numan Siddique <[email protected]>
>
> Implement the controller-side handler for the new OVN action
> 'icmp4.inner_ip4.src = <value>' introduced in the previous patch.
>
> When ovn-controller receives a packet with action opcode
> ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC, it now:
>
>   - Locates the inner IPv4 header inside the ICMP error payload
>     (right after the 8-byte outer ICMP header), validating on
>     lengths (dp_packet_l4_size()) so that no out-of-bounds pointer
>     is ever synthesized.
>   - Rewrites the inner IPv4 source address to the value supplied
>     in the userdata.
>   - Fixes the inner IPv4 header checksum incrementally.
>
> No outer ICMP checksum update is performed.  The ICMP checksum
> covers the inner IP source (changed by +D) and the inner IP
> checksum field (changed by -D by the incremental update above),
> and nothing else here, so the two deltas cancel.  This holds for
> any inner header, valid or not, precisely because the inner
> checksum update is incremental.
>
> The inner L4 (TCP/UDP) checksum is intentionally left unchanged.
> Rewriting the inner source invalidates it via the L4
> pseudo-header, but stateless NAT does not remap ports and the
> inner L4 checksum of an ICMP-quoted packet is not validated for
> error correlation: conntrack matches the error to the tracked
> flow using the inner 4-tuple, not the inner L4 checksum.  (Should
> a future change ever fix the inner L4 checksum, it would then need
> a compensating outer ICMP checksum update, as that field has no
> offsetting change inside the ICMP payload.)
>
> The outer IPv4 header is not touched, so its checksum stays valid.
> Finally the OpenFlow pipeline is resumed with the rewritten packet.
>
> This is the building block needed to make ICMP fragmentation-
> needed errors (RFC 1191 PMTU discovery) work correctly when an
> OVN logical router uses stateless NAT.  With stateless NAT, the
> inner payload of an incoming ICMP error still carries the
> external (post-NAT) IPv4 source, so conntrack in the VM's
> logical switch zone fails to match the tracked outgoing flow
> and marks the packet as +inv.  By rewriting the inner source
> back to the logical IP in the LR pipeline before the LS
> pipeline runs, conntrack can correlate the error with the
> original flow and the VM can honor the PMTUD signal.
>
> A follow-up patch will add the corresponding ovn-northd logical
> flow that emits this action for each stateless NAT configured
> on a logical router.
>
> Assisted-by: Claude Opus 4.7, Claude Code
> Signed-off-by: Numan Siddique <[email protected]>

Recheck-request: github-robot-_Build_and_Test


> ---
>  controller/pinctrl.c | 104 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 104 insertions(+)
>
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index 12eb5d65f3..4dc4f38ac9 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -263,6 +263,10 @@ static void pinctrl_handle_nd_ns(struct rconn *swconn,
>                                   const struct ofputil_packet_in *pin,
>                                   struct ofpbuf *userdata,
>                                   const struct ofpbuf *continuation);
> +static void pinctrl_handle_put_icmp4_inner_ip4_src(
> +    struct rconn *swconn, const struct flow *in_flow,
> +    struct dp_packet *pkt_in, struct ofputil_packet_in *pin,
> +    struct ofpbuf *userdata, struct ofpbuf *continuation);
>  static void
>  pinctrl_handle_event(struct ofpbuf *userdata)
>      OVS_REQUIRES(pinctrl_mutex);
> @@ -3928,6 +3932,10 @@ process_packet_in(struct rconn *swconn, const struct 
> ofp_header *msg)
>          break;
>      }
>
> +    case ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC:
> +        pinctrl_handle_put_icmp4_inner_ip4_src(swconn, &headers, &packet, 
> &pin,
> +                                               &userdata, &continuation);
> +        break;
>      default:
>          VLOG_WARN_RL(&rl, "unrecognized packet-in opcode %"PRIu32,
>                       ntohl(ah->opcode));
> @@ -6673,6 +6681,102 @@ exit:
>      dp_packet_uninit(pkt_out_ptr);
>  }
>
> +/* Called with in the pinctrl_handler thread context. */
> +static void
> +pinctrl_handle_put_icmp4_inner_ip4_src(struct rconn *swconn,
> +                                       const struct flow *in_flow,
> +                                       struct dp_packet *pkt_in,
> +                                       struct ofputil_packet_in *pin,
> +                                       struct ofpbuf *userdata,
> +                                       struct ofpbuf *continuation)
> +{
> +    enum ofp_version version = rconn_get_version(swconn);
> +    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
> +    struct dp_packet *pkt_out = NULL;
> +
> +    /* This action only works for ICMPv4 packets. */
> +    if (!is_icmpv4(in_flow, NULL)) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl,
> +                     "put_icmp4_inner_ip4_src action on non-ICMPv4 packet");
> +        goto exit;
> +    }
> +
> +    /* A later IPv4 fragment carries no L4 header: the bytes at the L4 offset
> +     * are payload continuation, not an ICMP header, so there is nothing to
> +     * rewrite here.  The northd match on icmp4.type already excludes later
> +     * fragments; this keeps the controller path robust regardless. */
> +    if (in_flow->nw_frag & FLOW_NW_FRAG_LATER) {
> +        goto exit;
> +    }
> +
> +    ovs_be32 *ip4_src = ofpbuf_try_pull(userdata, sizeof *ip4_src);
> +    if (!ip4_src) {
> +        goto exit;
> +    }
> +
> +    /* dp_packet_clone() already copies l2_pad_size, l2_5_ofs, l3_ofs,
> +     * l4_ofs, packet_type and md. */
> +    pkt_out = dp_packet_clone(pkt_in);
> +
> +    struct icmp_header *ih = dp_packet_l4(pkt_out);
> +    size_t l4_size = dp_packet_l4_size(pkt_out);
> +
> +    /* Need the 8-byte ICMP header plus at least a minimal inner IPv4
> +     * header.  Check on lengths so we never synthesize an out-of-bounds
> +     * pointer. */
> +    if (!ih || l4_size < sizeof *ih + sizeof(struct ip_header)) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "put_icmp4_inner_ip4_src: ICMP payload too short "
> +                     "to contain an inner IPv4 header");
> +        goto exit;
> +    }
> +
> +    /* Known-good: at least sizeof(struct ip_header) bytes are available
> +     * from inner_ip to the tail (guaranteed by the check above). */
> +    struct ip_header *inner_ip = ALIGNED_CAST(struct ip_header *,
> +                                              (char *) ih + sizeof *ih);
> +    size_t inner_avail = l4_size - sizeof *ih;   /* no underflow: checked */
> +    size_t inner_ihl_bytes = IP_IHL(inner_ip->ip_ihl_ver) * 4;
> +
> +    if (inner_ihl_bytes < sizeof *inner_ip || inner_ihl_bytes > inner_avail) 
> {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "put_icmp4_inner_ip4_src: inner IPv4 header "
> +                     "length (%"PRIuSIZE") invalid", inner_ihl_bytes);
> +        goto exit;
> +    }
> +
> +    ovs_be32 old_inner_ip_src = get_16aligned_be32(&inner_ip->ip_src);
> +    put_16aligned_be32(&inner_ip->ip_src, *ip4_src);
> +
> +    /* Fix the inner IPv4 header checksum incrementally (not a full
> +     * recompute): a header that arrived with a bad checksum must stay bad
> +     * -- we must not launder it. */
> +    inner_ip->ip_csum = recalc_csum32(inner_ip->ip_csum,
> +                                      old_inner_ip_src, *ip4_src);
> +
> +    /* No outer ICMP checksum update is needed: it covers the inner IP
> +     * source (changed by +D) and the inner IP checksum field (changed by
> +     * -D by the incremental update above) and nothing else here, so the
> +     * two deltas cancel -- for any inner header, valid or not.
> +     *
> +     * The inner L4 (TCP/UDP) checksum is now stale w.r.t. its
> +     * pseudo-header and is deliberately left unchanged: stateless NAT does
> +     * not remap ports, and the inner L4 checksum of an ICMP-quoted packet
> +     * is not validated for error correlation (conntrack matches the inner
> +     * 4-tuple).  The outer IPv4 header is unchanged, so its checksum stays
> +     * valid too. */
> +
> +    pin->packet = dp_packet_data(pkt_out);
> +    pin->packet_len = dp_packet_size(pkt_out);
> +
> +exit:
> +    queue_msg(swconn, ofputil_encode_resume(pin, continuation, proto));
> +    if (pkt_out) {
> +        dp_packet_delete(pkt_out);
> +    }
> +}
> +
>  static void
>  wait_controller_event(void)
>  {
> --
> 2.54.0
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to