> From: Numan Siddique <[email protected]> > > OVN actions today can rewrite fields of the outermost IPv4 header > (e.g. 'ip4.dst = ...'), but they don't reach into the payload of an > ICMP error. An ICMPv4 error carries an embedded copy of the IP+L4 > header that triggered it (RFC 792 §3.2.2, RFC 1812 §4.3.2.3 expands > this to up to 576 bytes), and that inner header is what NAT-aware > receivers use to correlate the error with the original flow. > > For OVN logical routers that use stateless NAT, this gap is > user-visible: when an inbound ICMPv4 'Fragmentation Needed' error > (RFC 1191 PMTUD) arrives, the LR pipeline rewrites only the outer > ip4.dst back to the logical IP. The inner header still carries the > external (post-NAT) source, so conntrack in the downstream logical > switch zone fails to match the tracked outgoing flow, the LS ACL > stage drops the packet as ct.inv, and the VM's kernel never learns > to lower its PMTU. > > Introduce a new OVN-managed field 'icmp4.inner_ip4.src' that can be > assigned from an action, and which - at runtime - rewrites the > source IPv4 address inside an ICMPv4 error's embedded > original-packet header (un-NATing the post-NAT external address back > to the logical IP). > > Unlike the existing 'icmp4.frag_mtu' / 'icmp6.frag_mtu' fields, which > are encoded as ovn-field notes (the value sits in packet metadata and > is consumed at egress time without modifying packet bytes), > 'icmp4.inner_ip4.src' has to mutate bytes inside the ICMP payload and > recompute checksums. It is therefore encoded as a controller action > carrying a new opcode, ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC: the > packet is sent to ovn-controller, which performs the inner rewrite > and resumes the OpenFlow pipeline with the modified packet. > > To support both encoding styles, struct ovn_field gains a new boolean > 'is_note'. The existing frag_mtu fields set is_note=true and keep > their previous code path; the new field sets is_note=false and routes > through encode_start_controller_op() / encode_finish_controller_op(). > > Assisted-by: Claude Opus 4.7, Claude Code > Signed-off-by: Numan Siddique <[email protected]>
Acked-by: Lorenzo Bianconi <[email protected]> > --- > include/ovn/actions.h | 8 +++++++- > include/ovn/logical-fields.h | 9 +++++++++ > lib/actions.c | 38 +++++++++++++++++++++++++++++++++--- > lib/logical-fields.c | 11 ++++++++++- > ovn-sb.xml | 27 ++++++++++++++++++++++++- > tests/ovn.at | 3 +++ > utilities/ovn-trace.c | 6 ++++++ > 7 files changed, 96 insertions(+), 6 deletions(-) > > diff --git a/include/ovn/actions.h b/include/ovn/actions.h > index 2ca8dac8f7..712b723cdd 100644 > --- a/include/ovn/actions.h > +++ b/include/ovn/actions.h > @@ -829,7 +829,13 @@ OVNACTS > * - The 32-bit DHCP relay IP. > * - The 32-bit DHCP server IP. > */ > \ > - ACTION_OPCODE(DHCP_RELAY_RESP_CHK) > + ACTION_OPCODE(DHCP_RELAY_RESP_CHK) > \ > + > \ > + /* icmp4.inner_ip4.src = <value> > + * Arguments follow the action_header, in this format: > + * - The 32-bit IPv4 address. > + */ > \ > + ACTION_OPCODE(PUT_ICMP4_INNER_IP4_SRC) > > > enum action_opcode { > diff --git a/include/ovn/logical-fields.h b/include/ovn/logical-fields.h > index 3c0fb22e76..1f8fe0a9f9 100644 > --- a/include/ovn/logical-fields.h > +++ b/include/ovn/logical-fields.h > @@ -256,6 +256,13 @@ enum ovn_field_id { > */ > OVN_ICMP6_FRAG_MTU, > > + /* > + * Name: "icmp4.inner_ip4.src" - > + * Type: be32 > + * Description: Sets the inner payload IPv4 source address if present. > + */ > + OVN_ICMP4_INNER_IP4_SRC, > + > OVN_FIELD_N_IDS > }; > > @@ -264,6 +271,8 @@ struct ovn_field { > const char *name; > unsigned int n_bytes; /* Width of the field in bytes. */ > unsigned int n_bits; /* Number of significant bits in field. */ > + bool is_note; /* If true, the field is encoded as 'note' action, else as > + * controller action. */ > }; > > static inline const struct ovn_field * > diff --git a/lib/actions.c b/lib/actions.c > index 3fbaed7af6..be9947003b 100644 > --- a/lib/actions.c > +++ b/lib/actions.c > @@ -2043,6 +2043,7 @@ is_paused_nested_action(enum action_opcode opcode) > case ACTION_OPCODE_SPLIT_BUF_ACTION: > case ACTION_OPCODE_DHCP_RELAY_REQ_CHK: > case ACTION_OPCODE_DHCP_RELAY_RESP_CHK: > + case ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC: > default: > return false; > } > @@ -4017,6 +4018,10 @@ format_OVNFIELD_LOAD(const struct ovnact_load *load , > struct ds *s) > ds_put_format(s, "%s = %u;", f->name, > ntohs(load->imm.value.be16_int)); > break; > + case OVN_ICMP4_INNER_IP4_SRC: > + ds_put_format(s, "%s = "IP_FMT";", f->name, > + IP_ARGS(load->imm.value.be32_int)); > + break; > > case OVN_FIELD_N_IDS: > default: > @@ -4026,11 +4031,30 @@ format_OVNFIELD_LOAD(const struct ovnact_load *load , > struct ds *s) > > static void > encode_OVNFIELD_LOAD(const struct ovnact_load *load, > - const struct ovnact_encode_params *ep OVS_UNUSED, > + const struct ovnact_encode_params *ep, > struct ofpbuf *ofpacts) > { > const struct ovn_field *f = ovn_field_from_name(load->dst.symbol->name); > - size_t offset = encode_start_ovn_field_note(f->id, ofpacts); > + size_t offset = 0; > + if (f->is_note) { > + offset = encode_start_ovn_field_note(f->id, ofpacts); > + } else { > + enum action_opcode opcode; > + switch (f->id) { > + case OVN_ICMP4_INNER_IP4_SRC: > + opcode = ACTION_OPCODE_PUT_ICMP4_INNER_IP4_SRC; > + break; > + > + case OVN_ICMP4_FRAG_MTU: > + case OVN_ICMP6_FRAG_MTU: > + case OVN_FIELD_N_IDS: > + default: > + OVS_NOT_REACHED(); > + } > + > + offset = encode_start_controller_op(opcode, true, ep->ctrl_meter_id, > + ofpacts); > + } > > switch (f->id) { > case OVN_ICMP4_FRAG_MTU: { > @@ -4041,12 +4065,20 @@ encode_OVNFIELD_LOAD(const struct ovnact_load *load, > ofpbuf_put(ofpacts, &load->imm.value.be32_int, sizeof(ovs_be32)); > break; > } > + case OVN_ICMP4_INNER_IP4_SRC: { > + ofpbuf_put(ofpacts, &load->imm.value.be32_int, sizeof(ovs_be32)); > + break; > + } > case OVN_FIELD_N_IDS: > default: > OVS_NOT_REACHED(); > } > > - encode_finish_ovn_field_note(offset, ofpacts); > + if (f->is_note) { > + encode_finish_ovn_field_note(offset, ofpacts); > + } else { > + encode_finish_controller_op(offset, ofpacts); > + } > } > > static void > diff --git a/lib/logical-fields.c b/lib/logical-fields.c > index 807bb4db48..35d9a2e229 100644 > --- a/lib/logical-fields.c > +++ b/lib/logical-fields.c > @@ -30,11 +30,18 @@ const struct ovn_field ovn_fields[OVN_FIELD_N_IDS] = { > OVN_ICMP4_FRAG_MTU, > "icmp4.frag_mtu", > 2, 16, > + true, > }, { > OVN_ICMP6_FRAG_MTU, > "icmp6.frag_mtu", > 4, 32, > - }, > + true, > + }, { > + OVN_ICMP4_INNER_IP4_SRC, > + "icmp4.inner_ip4.src", > + 4, 32, > + false, > + } > }; > > static struct shash ovnfield_by_name; > @@ -411,6 +418,8 @@ ovn_init_symtab(struct shash *symtab) > > expr_symtab_add_ovn_field(symtab, "icmp4.frag_mtu", OVN_ICMP4_FRAG_MTU); > expr_symtab_add_ovn_field(symtab, "icmp6.frag_mtu", OVN_ICMP6_FRAG_MTU); > + expr_symtab_add_ovn_field(symtab, "icmp4.inner_ip4.src", > + OVN_ICMP4_INNER_IP4_SRC); > } > > const char * > diff --git a/ovn-sb.xml b/ovn-sb.xml > index e45b63d73f..470ca00f53 100644 > --- a/ovn-sb.xml > +++ b/ovn-sb.xml > @@ -1306,9 +1306,34 @@ > defined in the RFC 1191 with the value specified in > <var>constant</var>. > </p> > + </li> > + > + <li> > + <code>icmp4.inner_ip4.src</code> > + <p> > + This field rewrites the source IPv4 address carried in the > + inner (original-packet) IPv4 header that is embedded in the > + payload of an ICMPv4 error message, setting it to > + <var>constant</var>. The inner IPv4 header checksum and the > + outer ICMP checksum are recomputed accordingly. The action > + has no effect on packets that are not ICMPv4 errors. > + </p> > + > + <p> > + It can be used by stateless NAT to fix up inbound ICMPv4 > + errors, for example <code>Fragmentation Needed</code> > messages > + used for Path MTU discovery (RFC 1191). A stateless > + <code>dnat_and_snat</code> rule rewrites only the outer > + destination of such an error, while the embedded inner header > + still carries the external (post-NAT) IP as its source. As a > + result conntrack in the downstream logical switch cannot > + correlate the error with the tracked flow. Un-NATing the > inner > + source back to the logical IP lets conntrack relate the error > + and lets the originating workload honor the PMTU signal. > + </p> > > <p> > - Eg. icmp4.frag_mtu = 1500; > + Eg. <code>icmp4.inner_ip4.src = 10.0.0.10;</code> > </p> > </li> > </ul> > diff --git a/tests/ovn.at b/tests/ovn.at > index 1995a989d3..59b41bde82 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -1849,6 +1849,9 @@ icmp4_error { eth.dst = ff:ff:ff:ff:ff:ff; > icmp4.frag_mtu = 1500; output; }; out > icmp4.frag_mtu = 1500; > encodes as note:6f.76.6e.00.00.00.00.00.05.dc > > +icmp4.inner_ip4.src = 10.0.0.40; > + encodes as controller(userdata=00.00.00.1e.00.00.00.00.0a.00.00.28,pause) > + > # icmp6 > icmp6 { eth.dst = ff:ff:ff:ff:ff:ff; output; }; output; > encodes as > controller(userdata=00.00.00.0a.00.00.00.00.00.19.00.10.80.00.06.06.ff.ff.ff.ff.ff.ff.00.00.ff.ff.00.10.00.00.23.20.00.0e.ff.f8.OFTABLE_SAVE_INPORT_HEX.00.00.00),resubmit(,OFTABLE_SAVE_INPORT) > diff --git a/utilities/ovn-trace.c b/utilities/ovn-trace.c > index c09a9041f5..263157dc3d 100644 > --- a/utilities/ovn-trace.c > +++ b/utilities/ovn-trace.c > @@ -2763,6 +2763,12 @@ execute_ovnfield_load(const struct ovnact_load *load, > ntohs(load->imm.value.be16_int)); > break; > } > + case OVN_ICMP4_INNER_IP4_SRC: { > + ovntrace_node_append(super, OVNTRACE_NODE_MODIFY, > + "icmp4.inner_ip4.src = "IP_FMT, > + IP_ARGS(load->imm.value.be32_int)); > + break; > + } > case OVN_FIELD_N_IDS: > default: > OVS_NOT_REACHED(); > -- > 2.54.0 > > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev >
_______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
