Re: [ovs-dev] [PATCH] netdev-dpdk: Fallback to non tunnel offloading API.

2024-03-27 Thread junwan...@cestc.cn
I validated this modification on my x710 network card, but I found that
the outer UDP checksum of the transmitted packets is incorrect, leading 
to communication abnormalities. I think it's necessary to disable the outer
UDP checksum because although the capability reported by DPDK 
indicates support, in reality, the hardware doesn't actually support offloading,
resulting in outer UDP checksum errors.

tx_geneve_tso_offload="false", tx_ip_csum_offload="true", 
tx_out_ip_csum_offload="true",
tx_out_udp_csum_offload="true", tx_sctp_csum_offload="true", 
tx_tcp_csum_offload="true", 
tx_tcp_seg_offload="false", tx_udp_csum_offload="true", 
tx_vxlan_tso_offload="false"



junwan...@cestc.cn

From: David Marchand
Date: 2024-03-28 00:50
To: dev
CC: Mike Pattrick; Ilya Maximets; Jun Wang
Subject: [PATCH] netdev-dpdk: Fallback to non tunnel offloading API.
The outer checksum offloading API in DPDK is ambiguous and was
added by Intel folks with the assumption that any outer offloading
always goes with an inner offloading request.

With net/i40e and net/ice drivers, requesting outer ip checksum with a
tunnel context but no inner offloading request triggers a Tx failure
associated with a port MDD event.
2024-03-27T16:02:07.084Z|00018|dpdk|WARN|ice_interrupt_handler(): OICR:
MDD event

To avoid this situation, if no checksum or segmentation offloading is
requested on the inner part of a packet, fallback to "normal" (non outer)
offloading request.
And outer offloading can be re-enabled for net/i40e and netice.

Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")
Signed-off-by: David Marchand 
---
lib/netdev-dpdk.c | 84 +++
1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 2111f77681..939817474c 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1354,18 +1354,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
 info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
 }
-if (!strcmp(info.driver_name, "net_ice")
-|| !strcmp(info.driver_name, "net_i40e")) {
-/* FIXME: Driver advertises the capability but doesn't seem
- * to actually support it correctly.  Can remove this once
- * the driver is fixed on DPDK side. */
-VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a "
-  "net/ice or net/i40e port.", netdev_get_name(>up));
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
-}
-
 if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) {
 dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
 } else {
@@ -2584,20 +2572,20 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
struct rte_mbuf *mbuf)
 struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
 struct tcp_header *th;
-const uint64_t all_requests = (RTE_MBUF_F_TX_IP_CKSUM |
-   RTE_MBUF_F_TX_L4_MASK  |
-   RTE_MBUF_F_TX_OUTER_IP_CKSUM  |
-   RTE_MBUF_F_TX_OUTER_UDP_CKSUM |
-   RTE_MBUF_F_TX_TCP_SEG);
-const uint64_t all_marks = (RTE_MBUF_F_TX_IPV4 |
-RTE_MBUF_F_TX_IPV6 |
-RTE_MBUF_F_TX_OUTER_IPV4 |
-RTE_MBUF_F_TX_OUTER_IPV6 |
-RTE_MBUF_F_TX_TUNNEL_MASK);
-
-if (!(mbuf->ol_flags & all_requests)) {
+const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM |
+ RTE_MBUF_F_TX_L4_MASK |
+ RTE_MBUF_F_TX_TCP_SEG);
+const uint64_t all_outer_requests = (RTE_MBUF_F_TX_OUTER_IP_CKSUM  |
+  RTE_MBUF_F_TX_OUTER_UDP_CKSUM);
+const uint64_t all_inner_marks = (RTE_MBUF_F_TX_IPV4 |
+  RTE_MBUF_F_TX_IPV6);
+const uint64_t all_outer_marks = (RTE_MBUF_F_TX_OUTER_IPV4 |
+  RTE_MBUF_F_TX_OUTER_IPV6 |
+  RTE_MBUF_F_TX_TUNNEL_MASK);
+
+if (!(mbuf->ol_flags & (all_inner_requests | all_outer_requests))) {
 /* No offloads requested, no marks should be set. */
-mbuf->ol_flags &= ~all_marks;
+mbuf->ol_flags &= ~(all_inner_marks | all_outer_marks);
 uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK;
 if (OVS_UNLIKELY(unexpected)) {
@@ -2610,32 +2598,44 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
struct rte_mbuf *mbuf)
 return true;
 }
+const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK;
+if (OVS_UNLIKELY(tunnel_type
+ && tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE
+  

Re: [ovs-dev] [PATCH v2 0/5] ovsdb: raft: Fixes for cluster joining state.

2024-03-27 Thread Ilya Maximets
On 3/26/24 18:27, Ilya Maximets wrote:
> 
> Issues discovered while working on:
>   https://github.com/ovn-org/ovn/issues/235
> 
> 
> Version 2:
>   * Replaced the first patch with an implementation of leadership
> transfer to servers that applied the most changes. [Felix, Han]
>   * Fixed some typos. [Han]
>   * Added a MACRO for the join timeout value. [Han]
>   * Added Ack's from Han to patches 2-5.
> 
> 
> Ilya Maximets (5):
>   ovsdb: raft: Avoid transferring leadership to unavailable servers.
>   ovsdb: raft: Fix time intervals for multitasking while joining.
>   ovsdb: raft: Fix permanent joining state on a cluster member.
>   ovsdb: raft: Fix assertion when 1-node cluster looses leadership.
>   ovsdb: raft: Fix inability to join after leadership change round trip.
> 
>  ovsdb/raft.c   | 121 +
>  tests/ovsdb-cluster.at | 106 
>  2 files changed, 216 insertions(+), 11 deletions(-)
> 

Thanks, Han and Felix!

I applied the set.

Patches 1, 3 and 5 backported down to 2.17.
Patches 2 and 4 - down to 3.3.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 1/5] ovsdb: raft: Avoid transferring leadership to unavailable servers.

2024-03-27 Thread Ilya Maximets
On 3/26/24 21:10, Han Zhou wrote:
> 
> 
> On Tue, Mar 26, 2024 at 10:26 AM Ilya Maximets  > wrote:
>>
>> Current implementation of the leadership transfer just shoots the
>> leadership in the general direction of the first stable server in the
>> configuration.  It doesn't check if the server was active recently or
>> even that the connection is established.  This may result in sending
>> leadership to a disconnected or otherwise unavailable server.
>>
>> Such behavior should not cause log truncation or any other correctness
>> issues because the destination server would have all the append
>> requests queued up or the connection will be dropped by the leader.
>> In a worst case we will have a leader-less cluster until the next
>> election timer fires up.  Other servers will notice the absence of the
>> leader and will trigger a new leader election normally.
>> However, the potential wait for the election timer is not good as
>> real-world setups may have high values configured.
>>
>> Fix that by trying to transfer to servers that we know have applied
>> the most changes, i.e., have the highest 'match_index'.  Such servers
>> replied to the most recent append requests, so they have highest
>> chances to be healthy.  Choosing the random starting point in the
>> list of such servers so we don't transfer to the same server every
>> single time.  This slightly improves load distribution, but, most
>> importantly, increases robustness of our test suite, making it cover
>> more cases.  Also checking that the message was actually sent without
>> immediate failure.
>>
>> If we fail to transfer to any server with the highest index, try to
>> just transfer to any other server that is not behind majority and then
>> just any other server that is connected.  We did actually send them
>> all the updates (if the connection is open), they just didn't reply
>> yet for one reason or another.  It should be better than leaving the
>> cluster without a leader.
>>
>> Note that there is always a chance that transfer will fail, since
>> we're not waiting for it to be acknowledged (and must not wait).
>> In this case, normal election will be triggered after the election
>> timer fires up.
>>
>> Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered 
>> databases.")
>> Signed-off-by: Ilya Maximets mailto:i.maxim...@ovn.org>>
>> ---
>>
>> CC: Felix Huettner 
>>
>>  ovsdb/raft.c | 48 
>>  1 file changed, 44 insertions(+), 4 deletions(-)
>>
>> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
>> index f463afcb3..b171da345 100644
>> --- a/ovsdb/raft.c
>> +++ b/ovsdb/raft.c
>> @@ -1261,10 +1261,30 @@ raft_transfer_leadership(struct raft *raft, const 
>> char *reason)
>>          return;
>>      }
>>
>> -    struct raft_server *s;
>> +    struct raft_server **servers, *s;
>> +    uint64_t threshold = 0;
>> +    size_t n = 0, start, i;
>> +
>> +    servers = xmalloc(hmap_count(>servers) * sizeof *servers);
>> +
>>      HMAP_FOR_EACH (s, hmap_node, >servers) {
>> -        if (!uuid_equals(>sid, >sid)
>> -            && s->phase == RAFT_PHASE_STABLE) {
>> +        if (uuid_equals(>sid, >sid)
>> +            || s->phase != RAFT_PHASE_STABLE) {
>> +            continue;
>> +        }
>> +        if (s->match_index > threshold) {
>> +            threshold = s->match_index;
>> +        }
>> +        servers[n++] = s;
>> +    }
>> +
>> +    start = n ? random_range(n) : 0;
>> +
>> +retry:
>> +    for (i = 0; i < n; i++) {
>> +        s = servers[(start + i) % n];
>> +
>> +        if (s->match_index >= threshold) {
>>              struct raft_conn *conn = raft_find_conn_by_sid(raft, >sid);
>>              if (!conn) {
>>                  continue;
>> @@ -1280,7 +1300,10 @@ raft_transfer_leadership(struct raft *raft, const 
>> char *reason)
>>                      .term = raft->term,
>>                  }
>>              };
>> -            raft_send_to_conn(raft, , conn);
>> +
>> +            if (!raft_send_to_conn(raft, , conn)) {
>> +                continue;
>> +            }
>>
>>              raft_record_note(raft, "transfer leadership",
>>                               "transferring leadership to %s because %s",
>> @@ -1288,6 +1311,23 @@ raft_transfer_leadership(struct raft *raft, const 
>> char *reason)
>>              break;
>>          }
>>      }
>> +
>> +    if (n && i == n && threshold) {
>> +        if (threshold > raft->commit_index) {
>> +            /* Failed to transfer to servers with the highest 'match_index'.
>> +             * Try other servers that are not behind the majority. */
>> +            threshold = raft->commit_index;
>> +        } else {
>> +            /* Try any other server.  It is safe, because they either have 
>> all
>> +             * the append requests queued up for them before the leadership
>> +             * transfer message or their connection is broken and we will 
>> not
>> +             * transfer anyway. */
>> +   

Re: [ovs-dev] [PATCH] Userspace: Software fallback for UDP encapsulated TCP segmentation.

2024-03-27 Thread Ilya Maximets
On 3/27/24 22:07, Mike Pattrick wrote:
> On Wed, Mar 27, 2024 at 1:39 PM Simon Horman  wrote:
>>
>> On Tue, Feb 20, 2024 at 11:08:55PM -0500, Mike Pattrick wrote:
>>> When sending packets that are flagged as requiring segmentation to an
>>> interface that doens't support this feature, send the packet to the TSO
>>> software fallback instead of dropping it.
>>>
>>> Signed-off-by: Mike Pattrick 
>>
>> Hi Mike,
>>
>> Can I confirm that from your PoV this patch is still awaiting review?
>> I ask because it's been sitting around for a while now.
> 
> I believe this patch now needs to be modified for the recent
> recirculation change. I'll update it for that, give it another once
> over, and resubmit.

While at it, please, check that the outer UDP header is correctly
updated.  It is likely not, according to the tests done here:
  https://github.com/openvswitch/ovs-issues/issues/321#issuecomment-2008592610
It seem to keep the length that is larger than IP payload size.

If so, some unit tests for this functionality would be nice to have
as well with the actual comparison of headers for different packets.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] Userspace: Software fallback for UDP encapsulated TCP segmentation.

2024-03-27 Thread Mike Pattrick
On Wed, Mar 27, 2024 at 1:39 PM Simon Horman  wrote:
>
> On Tue, Feb 20, 2024 at 11:08:55PM -0500, Mike Pattrick wrote:
> > When sending packets that are flagged as requiring segmentation to an
> > interface that doens't support this feature, send the packet to the TSO
> > software fallback instead of dropping it.
> >
> > Signed-off-by: Mike Pattrick 
>
> Hi Mike,
>
> Can I confirm that from your PoV this patch is still awaiting review?
> I ask because it's been sitting around for a while now.

I believe this patch now needs to be modified for the recent
recirculation change. I'll update it for that, give it another once
over, and resubmit.

-M


>

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] ovs-tcpdump:Solve the problem of residual mirror port

2024-03-27 Thread Simon Horman
On Thu, Mar 21, 2024 at 08:33:37PM +0800, yangchang wrote:
> Before creating an mirror port, it is necessary to first check whether the 
> original port exists.
> Otherwise, when the original port does not exist, the created mirror port 
> will remain.
> 
> Signed-off-by: yangchang 

Hi,

Thanks for your patch.

As I understand things, this avoids a residual mirror port for an error
case, by checking for the error condition before the port is created.

As such I agree it improves the situation.  However, I see several other
error cases that occur, which result in calls to sys.exit(1), after the
mirror port is created (the call to _make_mirror_name).  And that there are
other resources that seem to dangle, f.e. the call to _make_taps().

I am wondering if it would be netter to take a more robust approach to
cleaning up resources when errors occur.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] Userspace: Software fallback for UDP encapsulated TCP segmentation.

2024-03-27 Thread Simon Horman
On Tue, Feb 20, 2024 at 11:08:55PM -0500, Mike Pattrick wrote:
> When sending packets that are flagged as requiring segmentation to an
> interface that doens't support this feature, send the packet to the TSO
> software fallback instead of dropping it.
> 
> Signed-off-by: Mike Pattrick 

Hi Mike,

Can I confirm that from your PoV this patch is still awaiting review?
I ask because it's been sitting around for a while now.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v5] tunnel: Allow UDP zero checksum with IPv6 tunnels.

2024-03-27 Thread Simon Horman
On Wed, Mar 27, 2024 at 11:55:14AM -0400, Mike Pattrick wrote:
> This patch adopts the proposed RFC 6935 by allowing null UDP checksums
> even if the tunnel protocol is IPv6. This is already supported by Linux
> through the udp6zerocsumtx tunnel option. It is disabled by default and
> IPv6 tunnels are flagged as requiring a checksum, but this patch enables
> the user to set csum=false on IPv6 tunnels.
> 
> Signed-off-by: Mike Pattrick 
> ---
> v2: Changed documentation, and added a NEWS item
> v3: NEWS file merge conflict
> v4: Better comments, new test
> v5: Addressed identified nit's

Acked-by: Simon Horman 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack: Do clean instead of forece expire.

2024-03-27 Thread Aaron Conole
Cheng Li  writes:

> Force expire a connection and then create new connection of the same
> tuple(cmap hash). This makes ct->conns cmap operation expensive(
> within ct->ct_lock).
>
> This patch cover the scenario by doing the clean immediately instead
> of setting expire. Also this patch fix ct_clean debug log.
>
> Signed-off-by: Cheng Li 
> ---

I'm having trouble with this commit message.  You don't include any
details about any performance characteristics that you measured
anywhere, while (rightly) pointing out that this has a larger
performance impact.  Please include these details (how you measured the
impact, what changed, etc).

>  lib/conntrack.c | 25 -
>  1 file changed, 12 insertions(+), 13 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 8a7056bac..6e137daa6 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -461,12 +461,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>  atomic_count_dec(>n_conn);
>  }
> 
> -static void
> -conn_force_expire(struct conn *conn)
> -{
> -atomic_store_relaxed(>expiration, 0);
> -}
> -
>  /* Destroys the connection tracker 'ct' and frees all the allocated memory.
>   * The caller of this function must already have shut down packet input
>   * and PMD threads (which would have been quiesced).  */
> @@ -1030,7 +1024,10 @@ conn_update_state(struct conntrack *ct, struct 
> dp_packet *pkt,
>  case CT_UPDATE_NEW:
>  if (conn_lookup(ct, >key_node[CT_DIR_FWD].key,
>  now, NULL, NULL)) {
> -conn_force_expire(conn);
> +/* Instead of setting conn->expiration and let ct_clean 
> thread
> + * clean the conn, doing the clean now to avoid duplicate 
> hash
> + * in ct->conns for performance reason. */
> +conn_clean(ct, conn);

I guess the idea is that the clean can happen from the unlocked context,
but that creates a lock in the datapath (which is why the expire list
update is here).  The conn_key_lookup() has to iterate over the list
more, but then it doesn't need to take an expensive lock.

In this case, we will have to acquire the big CT lock, and if there are
two lookups happening simultaneously we will pend processing one packet
for the other.  Ideally, the adaptive nature of the pthread will prevent
us from having to call into kernel, but that isn't a guarantee - just a
best effort.

Have you done any kind of measurement on this?  I'm quite worried about
the worst-case that this introduces into the packet processing path.
The existing design avoids this worst-case lock performance precisely
because it marks the connection as needing expiration so that future
lookups will skip.

>  }
>  create_new_conn = true;
>  break;
> @@ -1242,7 +1239,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
>  if (OVS_UNLIKELY(force && ctx->reply && conn)) {
>  if (conn_lookup(ct, >key_node[CT_DIR_FWD].key,
>  now, NULL, NULL)) {
> -conn_force_expire(conn);
> +conn_clean(ct, conn);
>  }
>  conn = NULL;
>  }
> @@ -1429,7 +1426,8 @@ conntrack_get_sweep_interval(struct conntrack *ct)
>  }
> 
>  static size_t
> -ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
> +ct_sweep(struct conntrack *ct, struct rculist *list, long long now,
> + size_t *cleaned_count)
>  OVS_NO_THREAD_SAFETY_ANALYSIS
>  {
>  struct conn *conn;
> @@ -1438,6 +1436,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, 
> long long now)
>  RCULIST_FOR_EACH (conn, node, list) {
>  if (conn_expired(conn, now)) {
>  conn_clean(ct, conn);
> +(*cleaned_count) ++;

A fix somewhat like this may be needed anyway.  The log seems to have become
inaccurate with:

3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with 
rculists.")

Please submit it as a separate patch so we can backport.

>  }
> 
>  count++;
> @@ -1454,7 +1453,7 @@ conntrack_clean(struct conntrack *ct, long long now)
>  {
>  long long next_wakeup = now + conntrack_get_sweep_interval(ct);
>  unsigned int n_conn_limit, i;
> -size_t clean_end, count = 0;
> +size_t clean_end, count = 0, cleaned_count = 0;
> 
>  atomic_read_relaxed(>n_conn_limit, _conn_limit);
>  clean_end = n_conn_limit / 64;
> @@ -1465,13 +1464,13 @@ conntrack_clean(struct conntrack *ct, long long now)
>  break;
>  }
> 
> -count += ct_sweep(ct, >exp_lists[i], now);
> +count += ct_sweep(ct, >exp_lists[i], now, _count);
>  }
> 
>  ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
>  
> -VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
> - time_msec() - now);
> +VLOG_DBG("conntrack cleanup %"PRIuSIZE"/%"PRIuSIZE" entries in %lld 
> msec",
> + cleaned_count, count, time_msec() 

Re: [ovs-dev] [PATCH ovn] controller: Track individual address set constants.

2024-03-27 Thread Han Zhou
On Wed, Mar 27, 2024 at 12:46 AM Ales Musil  wrote:
>
> On Wed, Mar 27, 2024 at 7:14 AM Han Zhou  wrote:
>
> >
> >
> > On Tue, Mar 19, 2024 at 9:45 AM Ales Musil  wrote:
> > >
> > >
> > >
> > > On Tue, Mar 19, 2024 at 5:43 PM Ales Musil  wrote:
> > >>
> > >> Instead of tracking address set per struct expr_constant_set track it
> > >> per individual struct expr_constant. This allows more fine grained
> > >> control for I-P processing of address sets in controller. It helps
with
> > >> scenarios like matching on two address sets in one expression e.g.
> > >> "ip4.src == {$as1, $as2}". This allows any addition or removal of
> > >> individual adress from the set to be incrementally processed instead
> > >> of reprocessing all the flows.
> > >>
> > >> This unfortunately doesn't help with the following flows:
> > >> "ip4.src == $as1 && ip4.dst == $as2"
> > >> "ip4.src == $as1 || ip4.dst == $as2"
> > >>
> > >> The memory impact should be minimal as there is only increase of 8
bytes
> > >> per the struct expr_constant.
> > >>
> > >> Signed-off-by: Ales Musil 
> >
> > Thanks Ales for the improvement! The approach looks good to me. Please
see
> > some comments below:
> >
> >
> >
> Hi Han,
>
> thank you for the review.
>
>
> >
> > >> ---
> > >>  controller/lflow.c  |  4 +-
> > >>  include/ovn/actions.h   |  2 +-
> > >>  include/ovn/expr.h  | 46 ++--
> > >>  lib/actions.c   | 20 -
> > >>  lib/expr.c  | 95
+
> > >>  tests/ovn-controller.at | 14 +++---
> > >>  6 files changed, 83 insertions(+), 98 deletions(-)
> > >>
> > >> diff --git a/controller/lflow.c b/controller/lflow.c
> > >> index 895d17d19..730dc879d 100644
> > >> --- a/controller/lflow.c
> > >> +++ b/controller/lflow.c
> > >> @@ -278,7 +278,7 @@ lflow_handle_changed_flows(struct lflow_ctx_in
> > *l_ctx_in,
> > >>  }
> > >>
> > >>  static bool
> > >> -as_info_from_expr_const(const char *as_name, const union
expr_constant
> > *c,
> > >> +as_info_from_expr_const(const char *as_name, const struct
> > expr_constant *c,
> > >>  struct addrset_info *as_info)
> > >>  {
> > >>  as_info->name = as_name;
> > >> @@ -714,7 +714,7 @@ lflow_handle_addr_set_update(const char *as_name,
> > >>  if (as_diff->deleted) {
> > >>  struct addrset_info as_info;
> > >>  for (size_t i = 0; i < as_diff->deleted->n_values; i++)
{
> > >> -union expr_constant *c =
_diff->deleted->values[i];
> > >> +struct expr_constant *c =
_diff->deleted->values[i];
> > >>  if (!as_info_from_expr_const(as_name, c, _info))
{
> > >>  continue;
> > >>  }
> > >> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
> > >> index dcacbb1ff..1e20f9b81 100644
> > >> --- a/include/ovn/actions.h
> > >> +++ b/include/ovn/actions.h
> > >> @@ -238,7 +238,7 @@ struct ovnact_next {
> > >>  struct ovnact_load {
> > >>  struct ovnact ovnact;
> > >>  struct expr_field dst;
> > >> -union expr_constant imm;
> > >> +struct expr_constant imm;
> > >>  };
> > >>
> > >>  /* OVNACT_MOVE, OVNACT_EXCHANGE. */
> > >> diff --git a/include/ovn/expr.h b/include/ovn/expr.h
> > >> index c48f82398..e54edb5bf 100644
> > >> --- a/include/ovn/expr.h
> > >> +++ b/include/ovn/expr.h
> > >> @@ -368,7 +368,7 @@ bool expr_relop_from_token(enum lex_type type,
enum
> > expr_relop *relop);
> > >>  struct expr {
> > >>  struct ovs_list node;   /* In parent EXPR_T_AND or EXPR_T_OR
> > if any. */
> > >>  enum expr_type type;/* Expression type. */
> > >> -char *as_name;  /* Address set name. Null if it is
not
> > an
> > >> +const char *as_name;/* Address set name. Null if it is
not
> > an
> > >> address set. */
> > >>
> > >>  union {
> > >> @@ -505,40 +505,42 @@ enum expr_constant_type {
> > >>  };
> > >>
> > >>  /* A string or integer constant (one must know which from context).
*/
> > >> -union expr_constant {
> > >> -/* Integer constant.
> > >> - *
> > >> - * The width of a constant isn't always clear, e.g. if you write
> > "1",
> > >> - * there's no way to tell whether you mean for that to be a
1-bit
> > constant
> > >> - * or a 128-bit constant or somewhere in between. */
> > >> -struct {
> > >> -union mf_subvalue value;
> > >> -union mf_subvalue mask; /* Only initialized if 'masked'. */
> > >> -bool masked;
> > >> -
> > >> -enum lex_format format; /* From the constant's lex_token. */
> > >> -};
> > >> +struct expr_constant {
> > >> +const char *as_name;
> > >>
> > >> -/* Null-terminated string constant. */
> > >> -char *string;
> > >> +union {
> > >> +/* Integer constant.
> > >> + *
> > >> + * The width of a constant isn't always clear, e.g. if you
> > write "1",
> > >> + * there's no way to tell whether 

Re: [ovs-dev] [PATCH net] openvswitch: Set the skbuff pkt_type for proper pmtud support.

2024-03-27 Thread Aaron Conole
Eelco Chaudron  writes:

> On 25 Mar 2024, at 13:37, Ilya Maximets wrote:
>
>> On 3/25/24 13:22, Aaron Conole wrote:
>>> Eelco Chaudron  writes:
>>>
 On 22 Mar 2024, at 20:06, Aaron Conole wrote:

> Open vSwitch is originally intended to switch at layer 2, only dealing 
> with
> Ethernet frames.  With the introduction of l3 tunnels support, it crossed
> into the realm of needing to care a bit about some routing details when
> making forwarding decisions.  If an oversized packet would need to be
> fragmented during this forwarding decision, there is a chance for pmtu
> to get involved and generate a routing exception.  This is gated by the
> skbuff->pkt_type field.
>
> When a flow is already loaded into the openvswitch module this field is
> set up and transitioned properly as a packet moves from one port to
> another.  In the case that a packet execute is invoked after a flow is
> newly installed this field is not properly initialized.  This causes the
> pmtud mechanism to omit sending the required exception messages across
> the tunnel boundary and a second attempt needs to be made to make sure
> that the routing exception is properly setup.  To fix this, we set the
> outgoing packet's pkt_type to PACKET_OUTGOING, since it can only get
> to the openvswitch module via a port device or packet command.

 Is this not a problem when the packet comes from the bridge port in the 
 kernel?
>>>
>>> It very well may be an issue there as well, but the recommendation is to
>>> operate with the bridge port down as far as I know, so I don't know if
>>> this issue has been observed happening from the bridge port.
>>
>> FWIW, bridge ports are typically used as an entry point for tunneled
>> traffic so it can egress from a physical port attached to OVS.  It means
>> they are pretty much always UP in most common setups like OpenStack or
>> ovn-kubernetes and handle a decent amount of traffic.  They are also used
>> to direct some other types of traffic to the host kernel.
>
> +1 here, I’m talking about the same port. I think we only advise
> having this down for userspace bridges, but not in the case the bridge
> is the tunnel endpoint.

Okay, I'll confirm about up/down, but it seems like it shouldn't matter
and we should be setting the outgoing type.

>> Unless I misunderstood which ports we're talking about here.
>>
>> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] netdev-dpdk: Fallback to non tunnel offloading API.

2024-03-27 Thread David Marchand
The outer checksum offloading API in DPDK is ambiguous and was
added by Intel folks with the assumption that any outer offloading
always goes with an inner offloading request.

With net/i40e and net/ice drivers, requesting outer ip checksum with a
tunnel context but no inner offloading request triggers a Tx failure
associated with a port MDD event.
2024-03-27T16:02:07.084Z|00018|dpdk|WARN|ice_interrupt_handler(): OICR:
MDD event

To avoid this situation, if no checksum or segmentation offloading is
requested on the inner part of a packet, fallback to "normal" (non outer)
offloading request.
And outer offloading can be re-enabled for net/i40e and netice.

Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")
Signed-off-by: David Marchand 
---
 lib/netdev-dpdk.c | 84 +++
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 2111f77681..939817474c 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1354,18 +1354,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
 info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
 }
 
-if (!strcmp(info.driver_name, "net_ice")
-|| !strcmp(info.driver_name, "net_i40e")) {
-/* FIXME: Driver advertises the capability but doesn't seem
- * to actually support it correctly.  Can remove this once
- * the driver is fixed on DPDK side. */
-VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a "
-  "net/ice or net/i40e port.", netdev_get_name(>up));
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
-info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
-}
-
 if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) {
 dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
 } else {
@@ -2584,20 +2572,20 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
struct rte_mbuf *mbuf)
 struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
 struct tcp_header *th;
 
-const uint64_t all_requests = (RTE_MBUF_F_TX_IP_CKSUM |
-   RTE_MBUF_F_TX_L4_MASK  |
-   RTE_MBUF_F_TX_OUTER_IP_CKSUM  |
-   RTE_MBUF_F_TX_OUTER_UDP_CKSUM |
-   RTE_MBUF_F_TX_TCP_SEG);
-const uint64_t all_marks = (RTE_MBUF_F_TX_IPV4 |
-RTE_MBUF_F_TX_IPV6 |
-RTE_MBUF_F_TX_OUTER_IPV4 |
-RTE_MBUF_F_TX_OUTER_IPV6 |
-RTE_MBUF_F_TX_TUNNEL_MASK);
-
-if (!(mbuf->ol_flags & all_requests)) {
+const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM |
+ RTE_MBUF_F_TX_L4_MASK |
+ RTE_MBUF_F_TX_TCP_SEG);
+const uint64_t all_outer_requests = (RTE_MBUF_F_TX_OUTER_IP_CKSUM  |
+  RTE_MBUF_F_TX_OUTER_UDP_CKSUM);
+const uint64_t all_inner_marks = (RTE_MBUF_F_TX_IPV4 |
+  RTE_MBUF_F_TX_IPV6);
+const uint64_t all_outer_marks = (RTE_MBUF_F_TX_OUTER_IPV4 |
+  RTE_MBUF_F_TX_OUTER_IPV6 |
+  RTE_MBUF_F_TX_TUNNEL_MASK);
+
+if (!(mbuf->ol_flags & (all_inner_requests | all_outer_requests))) {
 /* No offloads requested, no marks should be set. */
-mbuf->ol_flags &= ~all_marks;
+mbuf->ol_flags &= ~(all_inner_marks | all_outer_marks);
 
 uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK;
 if (OVS_UNLIKELY(unexpected)) {
@@ -2610,32 +2598,44 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
struct rte_mbuf *mbuf)
 return true;
 }
 
+const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK;
+if (OVS_UNLIKELY(tunnel_type
+ && tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE
+ && tunnel_type != RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
+VLOG_WARN_RL(, "%s: Unexpected tunnel type: %#"PRIx64,
+ netdev_get_name(>up), tunnel_type);
+netdev_dpdk_mbuf_dump(netdev_get_name(>up),
+  "Packet with unexpected tunnel type", mbuf);
+return false;
+}
+
 /* If packet is vxlan or geneve tunnel packet, calculate outer
  * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
  * before. */
-const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK;
-if (tunnel_type == RTE_MBUF_F_TX_TUNNEL_GENEVE ||
-tunnel_type == RTE_MBUF_F_TX_TUNNEL_VXLAN) {
+if ((tunnel_type == RTE_MBUF_F_TX_TUNNEL_GENEVE ||
+ tunnel_type == RTE_MBUF_F_TX_TUNNEL_VXLAN) &&
+mbuf->ol_flags & 

[ovs-dev] [PATCH v5] tunnel: Allow UDP zero checksum with IPv6 tunnels.

2024-03-27 Thread Mike Pattrick
This patch adopts the proposed RFC 6935 by allowing null UDP checksums
even if the tunnel protocol is IPv6. This is already supported by Linux
through the udp6zerocsumtx tunnel option. It is disabled by default and
IPv6 tunnels are flagged as requiring a checksum, but this patch enables
the user to set csum=false on IPv6 tunnels.

Signed-off-by: Mike Pattrick 
---
v2: Changed documentation, and added a NEWS item
v3: NEWS file merge conflict
v4: Better comments, new test
v5: Addressed identified nit's
---
 NEWS  |  4 
 lib/netdev-native-tnl.c   |  2 +-
 lib/netdev-vport.c| 17 +++--
 lib/netdev.h  | 18 +-
 ofproto/tunnel.c  | 10 --
 tests/tunnel-push-pop-ipv6.at |  9 +
 tests/tunnel-push-pop.at  |  7 +++
 tests/tunnel.at   |  2 +-
 vswitchd/vswitch.xml  | 12 +---
 9 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index c9e4064e6..6c8c4a2dc 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ Post-v3.3.0
  * Conntrack now supports 'random' flag for selecting ports in a range
while natting and 'persistent' flag for selection of the IP address
from a range.
+ * IPv6 UDP tunnel encapsulation including Geneve and VXLAN will now
+   honour the csum option.  Configuring the interface with
+   "options:csum=false" now has the same effect as the udp6zerocsumtx
+   option has with Linux kernel UDP tunnels.
 
 
 v3.3.0 - 16 Feb 2024
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index dee9ab344..e8258bc4e 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -424,7 +424,7 @@ udp_build_header(const struct netdev_tunnel_config *tnl_cfg,
 udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP, 0);
 udp->udp_dst = tnl_cfg->dst_port;
 
-if (params->is_ipv6 || params->flow->tunnel.flags & FLOW_TNL_F_CSUM) {
+if (params->flow->tunnel.flags & FLOW_TNL_F_CSUM) {
 /* Write a value in now to mark that we should compute the checksum
  * later. 0x is handy because it is transparent to the
  * calculation. */
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 60caa02fb..234a4ebe1 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -702,7 +702,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap 
*args, char **errp)
 tnl_cfg.dst_port = htons(atoi(node->value));
 } else if (!strcmp(node->key, "csum") && has_csum) {
 if (!strcmp(node->value, "true")) {
-tnl_cfg.csum = true;
+tnl_cfg.csum = NETDEV_TNL_CSUM_ENABLED;
+} else if (!strcmp(node->value, "false")) {
+tnl_cfg.csum = NETDEV_TNL_CSUM_DISABLED;
 }
 } else if (!strcmp(node->key, "seq") && has_seq) {
 if (!strcmp(node->value, "true")) {
@@ -850,6 +852,15 @@ set_tunnel_config(struct netdev *dev_, const struct smap 
*args, char **errp)
 }
 }
 
+/* The default csum state for GRE is special as it does have an optional
+ * checksum but the default configuration isn't correlated with IP version
+ * like UDP tunnels are.  Likewise, tunnels with no checksum at all must be
+ * in this state. */
+if (tnl_cfg.csum == NETDEV_TNL_CSUM_DEFAULT &&
+(!has_csum || strstr(type, "gre"))) {
+tnl_cfg.csum = NETDEV_TNL_DEFAULT_NO_CSUM;
+}
+
 enum tunnel_layers layers = tunnel_supported_layers(type, _cfg);
 const char *full_type = (strcmp(type, "vxlan") ? type
  : (tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GPE)
@@ -1026,8 +1037,10 @@ get_tunnel_config(const struct netdev *dev, struct smap 
*args)
 }
 }
 
-if (tnl_cfg->csum) {
+if (tnl_cfg->csum == NETDEV_TNL_CSUM_ENABLED) {
 smap_add(args, "csum", "true");
+} else if (tnl_cfg->csum == NETDEV_TNL_CSUM_DISABLED) {
+smap_add(args, "csum", "false");
 }
 
 if (tnl_cfg->set_seq) {
diff --git a/lib/netdev.h b/lib/netdev.h
index 67a8486bd..5d253157c 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -111,6 +111,22 @@ enum netdev_srv6_flowlabel {
 SRV6_FLOWLABEL_COMPUTE,
 };
 
+enum netdev_tnl_csum {
+/* Default value for UDP tunnels if no configurations is present.  Enforce
+ * checksum calculation in IPv6 tunnels, disable in IPv4 tunnels. */
+NETDEV_TNL_CSUM_DEFAULT = 0,
+
+/* Checksum explicitly to be calculated. */
+NETDEV_TNL_CSUM_ENABLED,
+
+/* Checksum calculation explicitly disabled. */
+NETDEV_TNL_CSUM_DISABLED,
+
+/* A value for when there is no checksum or the default value is no
+ * checksum reguardless of IP version. */
+NETDEV_TNL_DEFAULT_NO_CSUM,
+};
+
 /* Configuration specific to tunnels. */
 struct netdev_tunnel_config {
 ovs_be64 in_key;
@@ -139,7 +155,7 @@ struct netdev_tunnel_config {
 uint8_t tos;
 bool tos_inherit;
 
-bool csum;

Re: [ovs-dev] [PATCH v2] ofproto-dpif-xlate: Fix continuations with associated metering.

2024-03-27 Thread Aaron Conole
Aaron Conole  writes:

> Ilya Maximets  writes:
>
>> On 3/22/24 14:40, Aaron Conole wrote:
>>> Open vSwitch supports the ability to invoke a controller action by way
>>> of a sample action with a specified meter.  In the normal case, this
>>> sample action is transparently generated during xlate processing.  However,
>>> when executing via a continuation, the logic to generate the sample
>>> action when finishing the context freeze was missing.  The result is that
>>> the behavior when action is 'controller(pause,meter_id=1)' does not match
>>> the behavior when action is 'controller(meter_id=1)'.
>>> 
>>> OVN and other controller solutions may rely on this metering to protect
>>> the control path, so it is critical to preserve metering, whether we are
>>> doing a plain old send to controller, or a continuation.
>>> 
>>> Fixes: 77ab5fd2a95b ("Implement serializing the state of packet
>>> traversal in "continuations".")
>>> Reported-at: https://issues.redhat.com/browse/FDP-455
>>> Tested-by: Alex Musil 
>>> Signed-off-by: Aaron Conole 
>>> ---
>>> v1 -> v2:
>>>   Clean up unrelated whitespace change
>>>   Fix style issues around test comments
>>>   Fix a line length issue truncating action in a comment
>>>   Change from `` to $() in the test
>>
>> Nit: May also change `` to $() in on_exit hook.
>
> I'll fix on apply.
>
>> But anyway:
>>
>> Acked-by: Ilya Maximets 
>
> Thanks for the review!

I fixed the nit and applied to all branches back to 2.17

Thanks all!

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v4] tunnel: Allow UDP zero checksum with IPv6 tunnels.

2024-03-27 Thread Simon Horman
On Thu, Mar 21, 2024 at 05:08:06PM -0400, Mike Pattrick wrote:
> This patch adopts the proposed RFC 6935 by allowing null UDP checksums
> even if the tunnel protocol is IPv6. This is already supported by Linux
> through the udp6zerocsumtx tunnel option. It is disabled by default and
> IPv6 tunnels are flagged as requiring a checksum, but this patch enables
> the user to set csum=false on IPv6 tunnels.
> 
> Signed-off-by: Mike Pattrick 
> ---
> v2: Changed documentation, and added a NEWS item
> v3: NEWS file merge conflict
> v4: Better comments, new test

Thanks Mike,

FWIIW I believe v4 addresses Ilya's review of v3.

Please find some minor comments from my side below.

...

> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c

...

> @@ -850,6 +852,15 @@ set_tunnel_config(struct netdev *dev_, const struct smap 
> *args, char **errp)
>  }
>  }
>  
> +/* The default csum state for GRE is special as it does have an optional
> + * checksum but the default configuration isn't correlated with IP 
> version
> + * like UDP tunnels are.  Likewise, tunnels with checksum at all must be 
> in
> + * this state. */

nit: with checksum -> with no checksum

> +if (tnl_cfg.csum == NETDEV_TNL_CSUM_DEFAULT &&
> +(!has_csum || strstr(type, "gre"))) {
> +tnl_cfg.csum = NETDEV_TNL_DEFAULT_NO_CSUM;
> +}
> +
>  enum tunnel_layers layers = tunnel_supported_layers(type, _cfg);
>  const char *full_type = (strcmp(type, "vxlan") ? type
>   : (tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GPE)

...

> diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c
> index 80ddee78a..6f462874e 100644
> --- a/ofproto/tunnel.c
> +++ b/ofproto/tunnel.c
> @@ -465,9 +465,14 @@ tnl_port_send(const struct ofport_dpif *ofport, struct 
> flow *flow,
>  
>  flow->tunnel.flags &= ~(FLOW_TNL_F_MASK & ~FLOW_TNL_PUB_F_MASK);
>  flow->tunnel.flags |= (cfg->dont_fragment ? FLOW_TNL_F_DONT_FRAGMENT : 0)
> -| (cfg->csum ? FLOW_TNL_F_CSUM : 0)
>  | (cfg->out_key_present ? FLOW_TNL_F_KEY : 0);
>  
> +if (cfg->csum == NETDEV_TNL_CSUM_ENABLED) {
> +flow->tunnel.flags |= FLOW_TNL_F_CSUM;
> +} else if (cfg->csum == NETDEV_TNL_CSUM_DEFAULT && !flow->tunnel.ip_dst) 
> {
> +flow->tunnel.flags |= FLOW_TNL_F_CSUM;
> +}
> +

nit: As the action is the same (or alternatively my eyes deceive me)
 for both arms of the condition above, I would have written it as
 (completely untested!):

if (cfg->csum == NETDEV_TNL_CSUM_ENABLED ||
(cfg->csum == NETDEV_TNL_CSUM_DEFAULT && !flow->tunnel.ip_dst)) {
flow->tunnel.flags |= FLOW_TNL_F_CSUM;
}

>  if (cfg->set_egress_pkt_mark) {
>  flow->pkt_mark = cfg->egress_pkt_mark;
>  wc->masks.pkt_mark = UINT32_MAX;

...

> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 8a1b607d7..5c3aa2a2a 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -3207,9 +3207,15 @@
>  
>
>  Optional.  Compute encapsulation header (either GRE or UDP)
> -checksums on outgoing packets.  Default is disabled, set to
> -true to enable.  Checksums present on incoming
> -packets will be validated regardless of this setting.
> +checksums on outgoing packets.  When unset (the default value),
> +checksum computing for outgoing packets is enabled for UDP IPv6
> +tunnels, and disabled for IPv4 UDP and GRE tunnels.  When set to

nit: I'm assuming this applies to both IPv4 and IPv6 GRE tunnels.
 If so perhaps the following is slightly clearer.

   tunnels, and disabled for GRE and IPv4 UDP tunnels.  When set to

> +false, no checksums will be computed for outgoing
> +tunnel encapsulation headers.  When true, checksums
> +will be computed for all outgoing tunnel encapsulation headers.
> +Checksums present on incoming packets will be validated
> +regardless of this setting.  Incoming packets without a checksum
> +will also be accepted regardless of this setting.
>
>  
>

...
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] northd: Support routing over other address families.

2024-03-27 Thread Felix Huettner via dev
In most cases IPv4 packets are routed only over other IPv4 networks and
IPv6 packets are routed only over IPv6 networks. However there is no
interent reason for this limitation. Routing IPv4 packets over IPv6
networks just requires the router to contain a route for an IPv4 network
with an IPv6 nexthop.

This was previously prevented in OVN in ovn-nbctl and northd. By
removing these filters the forwarding will work if the mac addresses are
prepopulated.

If the mac addresses are not prepopulated we will attempt to resolve them using
the original address family of the packet and not the address family of the
nexthop. This will fail and we will not forward the packet.

This feature can for example be used by service providers to
interconnect multiple IPv4 networks of a customer without needing to
negotiate free IPv4 addresses by just using any IPv6 address.

Signed-off-by: Felix Huettner 
---
 NEWS  |   4 +
 northd/northd.c   |  45 ++--
 tests/ovn-nbctl.at|   8 +-
 tests/ovn.at  | 615 ++
 utilities/ovn-nbctl.c |  12 +-
 5 files changed, 650 insertions(+), 34 deletions(-)

diff --git a/NEWS b/NEWS
index 4d6ebea89..b419b2628 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,10 @@ Post v24.03.0
 flow table id.
 "lflow-stage-to-oftable STAGE_NAME" that converts stage name into OpenFlow
 table id.
+  - Allow Static Routes where the address families of ip_prefix and nexthop
+diverge (e.g. IPv4 packets over IPv6 links). This is currently limited to
+nexthops that have their mac addresses prepopulated (so
+dynamic_neigh_routers must be false).
 
 OVN v24.03.0 - 01 Mar 2024
 --
diff --git a/northd/northd.c b/northd/northd.c
index 1839b7d8b..0359cde89 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -10238,18 +10238,6 @@ parsed_routes_add(struct ovn_datapath *od, const 
struct hmap *lr_ports,
 return NULL;
 }
 
-/* Verify that ip_prefix and nexthop have same address familiy. */
-if (valid_nexthop) {
-if (IN6_IS_ADDR_V4MAPPED() != IN6_IS_ADDR_V4MAPPED()) {
-static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
-VLOG_WARN_RL(, "Address family doesn't match between 
'ip_prefix'"
- " %s and 'nexthop' %s in static route "UUID_FMT,
- route->ip_prefix, route->nexthop,
- UUID_ARGS(>header_.uuid));
-return NULL;
-}
-}
-
 /* Verify that ip_prefix and nexthop are on the same network. */
 if (!is_discard_route &&
 !find_static_route_outport(od, lr_ports, route,
@@ -10666,7 +10654,7 @@ build_ecmp_route_flow(struct lflow_table *lflows, 
struct ovn_datapath *od,
   struct lflow_ref *lflow_ref)
 
 {
-bool is_ipv4 = IN6_IS_ADDR_V4MAPPED(>prefix);
+bool is_ipv4_network = IN6_IS_ADDR_V4MAPPED(>prefix);
 uint16_t priority;
 struct ecmp_route_list_node *er;
 struct ds route_match = DS_EMPTY_INITIALIZER;
@@ -10675,7 +10663,8 @@ build_ecmp_route_flow(struct lflow_table *lflows, 
struct ovn_datapath *od,
 int ofs = !strcmp(eg->origin, ROUTE_ORIGIN_CONNECTED) ?
 ROUTE_PRIO_OFFSET_CONNECTED: ROUTE_PRIO_OFFSET_STATIC;
 build_route_match(NULL, eg->route_table_id, prefix_s, eg->plen,
-  eg->is_src_route, is_ipv4, _match, , ofs);
+  eg->is_src_route, is_ipv4_network, _match,
+  , ofs);
 free(prefix_s);
 
 struct ds actions = DS_EMPTY_INITIALIZER;
@@ -10708,7 +10697,11 @@ build_ecmp_route_flow(struct lflow_table *lflows, 
struct ovn_datapath *od,
 /* Find the outgoing port. */
 const char *lrp_addr_s = NULL;
 struct ovn_port *out_port = NULL;
-if (!find_static_route_outport(od, lr_ports, route, is_ipv4,
+bool is_ipv4_gateway = is_ipv4_network;
+if (route->nexthop && route->nexthop[0]) {
+  is_ipv4_gateway = strchr(route->nexthop, '.') ? true : false;
+}
+if (!find_static_route_outport(od, lr_ports, route, is_ipv4_gateway,
_addr_s, _port)) {
 continue;
 }
@@ -10733,9 +10726,9 @@ build_ecmp_route_flow(struct lflow_table *lflows, 
struct ovn_datapath *od,
   "eth.src = %s; "
   "outport = %s; "
   "next;",
-  is_ipv4 ? REG_NEXT_HOP_IPV4 : REG_NEXT_HOP_IPV6,
+  is_ipv4_gateway ? REG_NEXT_HOP_IPV4 : REG_NEXT_HOP_IPV6,
   route->nexthop,
-  is_ipv4 ? REG_SRC_IPV4 : REG_SRC_IPV6,
+  is_ipv4_gateway ? REG_SRC_IPV4 : REG_SRC_IPV6,
   lrp_addr_s,
   out_port->lrp_networks.ea_s,
   out_port->json_key);
@@ -10757,13 +10750,18 @@ add_route(struct lflow_table *lflows, struct 
ovn_datapath *od,
   const 

Re: [ovs-dev] [PATCH v2 1/5] ovsdb: raft: Avoid transferring leadership to unavailable servers.

2024-03-27 Thread Felix Huettner via dev
On Tue, Mar 26, 2024 at 06:27:09PM +0100, Ilya Maximets wrote:
> Current implementation of the leadership transfer just shoots the
> leadership in the general direction of the first stable server in the
> configuration.  It doesn't check if the server was active recently or
> even that the connection is established.  This may result in sending
> leadership to a disconnected or otherwise unavailable server.
> 
> Such behavior should not cause log truncation or any other correctness
> issues because the destination server would have all the append
> requests queued up or the connection will be dropped by the leader.
> In a worst case we will have a leader-less cluster until the next
> election timer fires up.  Other servers will notice the absence of the
> leader and will trigger a new leader election normally.
> However, the potential wait for the election timer is not good as
> real-world setups may have high values configured.
> 
> Fix that by trying to transfer to servers that we know have applied
> the most changes, i.e., have the highest 'match_index'.  Such servers
> replied to the most recent append requests, so they have highest
> chances to be healthy.  Choosing the random starting point in the
> list of such servers so we don't transfer to the same server every
> single time.  This slightly improves load distribution, but, most
> importantly, increases robustness of our test suite, making it cover
> more cases.  Also checking that the message was actually sent without
> immediate failure.
> 
> If we fail to transfer to any server with the highest index, try to
> just transfer to any other server that is not behind majority and then
> just any other server that is connected.  We did actually send them
> all the updates (if the connection is open), they just didn't reply
> yet for one reason or another.  It should be better than leaving the
> cluster without a leader.
> 
> Note that there is always a chance that transfer will fail, since
> we're not waiting for it to be acknowledged (and must not wait).
> In this case, normal election will be triggered after the election
> timer fires up.
> 
> Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered 
> databases.")
> Signed-off-by: Ilya Maximets 
> ---
> 
> CC: Felix Huettner 
> 
>  ovsdb/raft.c | 48 
>  1 file changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index f463afcb3..b171da345 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -1261,10 +1261,30 @@ raft_transfer_leadership(struct raft *raft, const 
> char *reason)
>  return;
>  }
>  
> -struct raft_server *s;
> +struct raft_server **servers, *s;
> +uint64_t threshold = 0;
> +size_t n = 0, start, i;
> +
> +servers = xmalloc(hmap_count(>servers) * sizeof *servers);
> +
>  HMAP_FOR_EACH (s, hmap_node, >servers) {
> -if (!uuid_equals(>sid, >sid)
> -&& s->phase == RAFT_PHASE_STABLE) {
> +if (uuid_equals(>sid, >sid)
> +|| s->phase != RAFT_PHASE_STABLE) {
> +continue;
> +}
> +if (s->match_index > threshold) {
> +threshold = s->match_index;
> +}
> +servers[n++] = s;
> +}
> +
> +start = n ? random_range(n) : 0;
> +
> +retry:
> +for (i = 0; i < n; i++) {
> +s = servers[(start + i) % n];
> +
> +if (s->match_index >= threshold) {
>  struct raft_conn *conn = raft_find_conn_by_sid(raft, >sid);
>  if (!conn) {
>  continue;
> @@ -1280,7 +1300,10 @@ raft_transfer_leadership(struct raft *raft, const char 
> *reason)
>  .term = raft->term,
>  }
>  };
> -raft_send_to_conn(raft, , conn);
> +
> +if (!raft_send_to_conn(raft, , conn)) {
> +continue;
> +}
>  
>  raft_record_note(raft, "transfer leadership",
>   "transferring leadership to %s because %s",
> @@ -1288,6 +1311,23 @@ raft_transfer_leadership(struct raft *raft, const char 
> *reason)
>  break;
>  }
>  }
> +
> +if (n && i == n && threshold) {
> +if (threshold > raft->commit_index) {
> +/* Failed to transfer to servers with the highest 'match_index'.
> + * Try other servers that are not behind the majority. */
> +threshold = raft->commit_index;
> +} else {
> +/* Try any other server.  It is safe, because they either have 
> all
> + * the append requests queued up for them before the leadership
> + * transfer message or their connection is broken and we will not
> + * transfer anyway. */
> +threshold = 0;
> +}
> +goto retry;
> +}
> +
> +free(servers);
>  }
>  
>  /* Send a RemoveServerRequest to the rest of the servers in the cluster.
> -- 
> 2.44.0
> 


Re: [ovs-dev] [PATCH ovn] controller: Track individual address set constants.

2024-03-27 Thread Ales Musil
On Wed, Mar 27, 2024 at 7:14 AM Han Zhou  wrote:

>
>
> On Tue, Mar 19, 2024 at 9:45 AM Ales Musil  wrote:
> >
> >
> >
> > On Tue, Mar 19, 2024 at 5:43 PM Ales Musil  wrote:
> >>
> >> Instead of tracking address set per struct expr_constant_set track it
> >> per individual struct expr_constant. This allows more fine grained
> >> control for I-P processing of address sets in controller. It helps with
> >> scenarios like matching on two address sets in one expression e.g.
> >> "ip4.src == {$as1, $as2}". This allows any addition or removal of
> >> individual adress from the set to be incrementally processed instead
> >> of reprocessing all the flows.
> >>
> >> This unfortunately doesn't help with the following flows:
> >> "ip4.src == $as1 && ip4.dst == $as2"
> >> "ip4.src == $as1 || ip4.dst == $as2"
> >>
> >> The memory impact should be minimal as there is only increase of 8 bytes
> >> per the struct expr_constant.
> >>
> >> Signed-off-by: Ales Musil 
>
> Thanks Ales for the improvement! The approach looks good to me. Please see
> some comments below:
>
>
>
Hi Han,

thank you for the review.


>
> >> ---
> >>  controller/lflow.c  |  4 +-
> >>  include/ovn/actions.h   |  2 +-
> >>  include/ovn/expr.h  | 46 ++--
> >>  lib/actions.c   | 20 -
> >>  lib/expr.c  | 95 +
> >>  tests/ovn-controller.at | 14 +++---
> >>  6 files changed, 83 insertions(+), 98 deletions(-)
> >>
> >> diff --git a/controller/lflow.c b/controller/lflow.c
> >> index 895d17d19..730dc879d 100644
> >> --- a/controller/lflow.c
> >> +++ b/controller/lflow.c
> >> @@ -278,7 +278,7 @@ lflow_handle_changed_flows(struct lflow_ctx_in
> *l_ctx_in,
> >>  }
> >>
> >>  static bool
> >> -as_info_from_expr_const(const char *as_name, const union expr_constant
> *c,
> >> +as_info_from_expr_const(const char *as_name, const struct
> expr_constant *c,
> >>  struct addrset_info *as_info)
> >>  {
> >>  as_info->name = as_name;
> >> @@ -714,7 +714,7 @@ lflow_handle_addr_set_update(const char *as_name,
> >>  if (as_diff->deleted) {
> >>  struct addrset_info as_info;
> >>  for (size_t i = 0; i < as_diff->deleted->n_values; i++) {
> >> -union expr_constant *c = _diff->deleted->values[i];
> >> +struct expr_constant *c = _diff->deleted->values[i];
> >>  if (!as_info_from_expr_const(as_name, c, _info)) {
> >>  continue;
> >>  }
> >> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
> >> index dcacbb1ff..1e20f9b81 100644
> >> --- a/include/ovn/actions.h
> >> +++ b/include/ovn/actions.h
> >> @@ -238,7 +238,7 @@ struct ovnact_next {
> >>  struct ovnact_load {
> >>  struct ovnact ovnact;
> >>  struct expr_field dst;
> >> -union expr_constant imm;
> >> +struct expr_constant imm;
> >>  };
> >>
> >>  /* OVNACT_MOVE, OVNACT_EXCHANGE. */
> >> diff --git a/include/ovn/expr.h b/include/ovn/expr.h
> >> index c48f82398..e54edb5bf 100644
> >> --- a/include/ovn/expr.h
> >> +++ b/include/ovn/expr.h
> >> @@ -368,7 +368,7 @@ bool expr_relop_from_token(enum lex_type type, enum
> expr_relop *relop);
> >>  struct expr {
> >>  struct ovs_list node;   /* In parent EXPR_T_AND or EXPR_T_OR
> if any. */
> >>  enum expr_type type;/* Expression type. */
> >> -char *as_name;  /* Address set name. Null if it is not
> an
> >> +const char *as_name;/* Address set name. Null if it is not
> an
> >> address set. */
> >>
> >>  union {
> >> @@ -505,40 +505,42 @@ enum expr_constant_type {
> >>  };
> >>
> >>  /* A string or integer constant (one must know which from context). */
> >> -union expr_constant {
> >> -/* Integer constant.
> >> - *
> >> - * The width of a constant isn't always clear, e.g. if you write
> "1",
> >> - * there's no way to tell whether you mean for that to be a 1-bit
> constant
> >> - * or a 128-bit constant or somewhere in between. */
> >> -struct {
> >> -union mf_subvalue value;
> >> -union mf_subvalue mask; /* Only initialized if 'masked'. */
> >> -bool masked;
> >> -
> >> -enum lex_format format; /* From the constant's lex_token. */
> >> -};
> >> +struct expr_constant {
> >> +const char *as_name;
> >>
> >> -/* Null-terminated string constant. */
> >> -char *string;
> >> +union {
> >> +/* Integer constant.
> >> + *
> >> + * The width of a constant isn't always clear, e.g. if you
> write "1",
> >> + * there's no way to tell whether you mean for that to be a
> 1-bit
> >> + * constant or a 128-bit constant or somewhere in between. */
> >> +struct {
> >> +union mf_subvalue value;
> >> +union mf_subvalue mask; /* Only initialized if 'masked'. */
> >> +bool masked;
> >> +
> >> +

Re: [ovs-dev] [PATCH ovn v3 6/8] utilities/docker: Fix up container build.

2024-03-27 Thread Ales Musil
On Tue, Mar 26, 2024 at 7:09 PM Mark Michelson  wrote:

> On 3/25/24 05:09, Ales Musil wrote:
> >
> >
> > On Mon, Mar 25, 2024 at 9:47 AM Dumitru Ceara  > > wrote:
> >
> > On 3/22/24 19:34, Mark Michelson wrote:
> >  > On 3/21/24 19:15, Dumitru Ceara wrote:
> >  >> Most of the steps were inaccurate.  Instead, use latest Ubuntu,
> use
> >  >> OVS from the submodule inside the OVN repo.
> >  >>
> >  >> Signed-off-by: Dumitru Ceara  > >
> >  >> ---
> >  >>   utilities/docker/Makefile  |  4 ++--
> >  >>   utilities/docker/debian/Dockerfile |  2 +-
> >  >>   utilities/docker/debian/build.sh   |  2 ++
> >  >>   utilities/docker/install_ovn.sh| 31
> > ++
> >  >>   4 files changed, 19 insertions(+), 20 deletions(-)
> >  >>
> >  >> diff --git a/utilities/docker/Makefile
> b/utilities/docker/Makefile
> >  >> index 57e95651cb..aad9c3482c 100644
> >  >> --- a/utilities/docker/Makefile
> >  >> +++ b/utilities/docker/Makefile
> >  >> @@ -1,5 +1,5 @@
> >  >> -#export OVN_BRANCH=master
> >  >> -#export OVN_VERSION=2.12
> >  >> +#export OVN_BRANCH=main
> >  >> +#export OVN_VERSION=24.03.90
> >  >
> >  > Is this something that we should update with each major release
> > of OVN?
> >  > If so, I could probably alter my release script to include
> updating
> >  > utilities/docker/Makefile as part of the release patches.
> >  >
> >
> > It's just a comment and I guess the original intention was to show
> users
> > how to set this up.  But, back to your question, if it's not a lot of
> > work, automatically changing this on release would be nice.
> >
> > Also, it might make sense to set up a CI job that actually runs this
> in
> > CI, maybe periodically.  I can try to find some time to do that at
> some
> > point in the future.  Wdyt?
> >
> >
> >
> > Hi Mark, Dumitru,
> >
> > from a different perspective, I would be interested to see if this is
> > still used by anyone. We have other methods of running OVN in containers
> > that are up to date and maintained. It might be the case of working
> > perfectly, still used and doesn't need any attention. But it would
> > probably still be useful to hear from the community if that's the case.
> > IMO we shouldn't run CI on something just for the sake of running CI.
> >
> > Does that make sense?
>
> First off, with regards to this patch, I think it can be merged as-is,
> with no further action. If we want to continue this discussion, we
> shouldn't let it hold up the patch.
>

 Yeah definitely, I don't want to block this at all, I was just curious.


> When it comes to your questions, Ales, they're really hard to answer :)
>
> Is anyone using utilities/docker? I have no idea. I'd be willing to bet
> most CMSes are using their own containerization methods instead of the
> in-tree method. However, I don't know if there are people that use it
> for development purposes. I know I tend to use ovn-fake-multinode during
> development/testing, but that's just me.
>
> When you mention having other methods of running OVN in containers, are
> you referring to the content in utilities/containers ?


I was mainly referring to ovn-fake-multinode in that case.



> I agree that when
> it comes to CI, we probably don't need the content in utilities/docker .
> But it appears to have been written to be a general-purpose method of
> creating versioned container builds of OVN, rather than specifically for
> CI. The content in utilities/docker is also quite out of date, I would
> imagine, since it hasn't been updated since adding cluster support in
> January 2020. There is also documentation throughout the tree that
> refers to the content in utilities/docker. If a new user were to
> download the OVN source and browse the documentation, they would try to
> use the Makefile in utilities/docker.
>

> In my opinion, the best thing to do is to merge the two containerization
> methods together into a single unified method. We can then use this for
> our CI, meaning we will keep it up to date and add necessary features as
> we see fit. This way we aren't keeping a "dead" method alive in the tree
> for no reason, and we won't be adding unnecessary CI either.
>
> What do you think?
>

I'm not sure merging is the right thing, as there is probably not one size
to fit them all. We could certainly change the documentation with reference
to the utilities/containers as a way to have reproducible builds.

Anyway let's proceed with the series as is and we can always follow up with
other ideas for this area.


>
> >
> >
> >  >>   #export DISTRO=debian
> >  >>   #export GITHUB_SRC=https://github.com/ovn-org/ovn.git
> > 
> >  >>   #export DOCKER_REPO=ovn-org/ovn
> >  >> diff --git a/utilities/docker/debian/Dockerfile
> >  >> 

Re: [ovs-dev] [PATCH ovn] controller: Track individual address set constants.

2024-03-27 Thread Han Zhou
On Tue, Mar 19, 2024 at 9:45 AM Ales Musil  wrote:
>
>
>
> On Tue, Mar 19, 2024 at 5:43 PM Ales Musil  wrote:
>>
>> Instead of tracking address set per struct expr_constant_set track it
>> per individual struct expr_constant. This allows more fine grained
>> control for I-P processing of address sets in controller. It helps with
>> scenarios like matching on two address sets in one expression e.g.
>> "ip4.src == {$as1, $as2}". This allows any addition or removal of
>> individual adress from the set to be incrementally processed instead
>> of reprocessing all the flows.
>>
>> This unfortunately doesn't help with the following flows:
>> "ip4.src == $as1 && ip4.dst == $as2"
>> "ip4.src == $as1 || ip4.dst == $as2"
>>
>> The memory impact should be minimal as there is only increase of 8 bytes
>> per the struct expr_constant.
>>
>> Signed-off-by: Ales Musil 

Thanks Ales for the improvement! The approach looks good to me. Please see
some comments below:

>> ---
>>  controller/lflow.c  |  4 +-
>>  include/ovn/actions.h   |  2 +-
>>  include/ovn/expr.h  | 46 ++--
>>  lib/actions.c   | 20 -
>>  lib/expr.c  | 95 +
>>  tests/ovn-controller.at | 14 +++---
>>  6 files changed, 83 insertions(+), 98 deletions(-)
>>
>> diff --git a/controller/lflow.c b/controller/lflow.c
>> index 895d17d19..730dc879d 100644
>> --- a/controller/lflow.c
>> +++ b/controller/lflow.c
>> @@ -278,7 +278,7 @@ lflow_handle_changed_flows(struct lflow_ctx_in
*l_ctx_in,
>>  }
>>
>>  static bool
>> -as_info_from_expr_const(const char *as_name, const union expr_constant
*c,
>> +as_info_from_expr_const(const char *as_name, const struct expr_constant
*c,
>>  struct addrset_info *as_info)
>>  {
>>  as_info->name = as_name;
>> @@ -714,7 +714,7 @@ lflow_handle_addr_set_update(const char *as_name,
>>  if (as_diff->deleted) {
>>  struct addrset_info as_info;
>>  for (size_t i = 0; i < as_diff->deleted->n_values; i++) {
>> -union expr_constant *c = _diff->deleted->values[i];
>> +struct expr_constant *c = _diff->deleted->values[i];
>>  if (!as_info_from_expr_const(as_name, c, _info)) {
>>  continue;
>>  }
>> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
>> index dcacbb1ff..1e20f9b81 100644
>> --- a/include/ovn/actions.h
>> +++ b/include/ovn/actions.h
>> @@ -238,7 +238,7 @@ struct ovnact_next {
>>  struct ovnact_load {
>>  struct ovnact ovnact;
>>  struct expr_field dst;
>> -union expr_constant imm;
>> +struct expr_constant imm;
>>  };
>>
>>  /* OVNACT_MOVE, OVNACT_EXCHANGE. */
>> diff --git a/include/ovn/expr.h b/include/ovn/expr.h
>> index c48f82398..e54edb5bf 100644
>> --- a/include/ovn/expr.h
>> +++ b/include/ovn/expr.h
>> @@ -368,7 +368,7 @@ bool expr_relop_from_token(enum lex_type type, enum
expr_relop *relop);
>>  struct expr {
>>  struct ovs_list node;   /* In parent EXPR_T_AND or EXPR_T_OR if
any. */
>>  enum expr_type type;/* Expression type. */
>> -char *as_name;  /* Address set name. Null if it is not
an
>> +const char *as_name;/* Address set name. Null if it is not
an
>> address set. */
>>
>>  union {
>> @@ -505,40 +505,42 @@ enum expr_constant_type {
>>  };
>>
>>  /* A string or integer constant (one must know which from context). */
>> -union expr_constant {
>> -/* Integer constant.
>> - *
>> - * The width of a constant isn't always clear, e.g. if you write
"1",
>> - * there's no way to tell whether you mean for that to be a 1-bit
constant
>> - * or a 128-bit constant or somewhere in between. */
>> -struct {
>> -union mf_subvalue value;
>> -union mf_subvalue mask; /* Only initialized if 'masked'. */
>> -bool masked;
>> -
>> -enum lex_format format; /* From the constant's lex_token. */
>> -};
>> +struct expr_constant {
>> +const char *as_name;
>>
>> -/* Null-terminated string constant. */
>> -char *string;
>> +union {
>> +/* Integer constant.
>> + *
>> + * The width of a constant isn't always clear, e.g. if you
write "1",
>> + * there's no way to tell whether you mean for that to be a
1-bit
>> + * constant or a 128-bit constant or somewhere in between. */
>> +struct {
>> +union mf_subvalue value;
>> +union mf_subvalue mask; /* Only initialized if 'masked'. */
>> +bool masked;
>> +
>> +enum lex_format format; /* From the constant's lex_token. */
>> +};
>> +
>> +/* Null-terminated string constant. */
>> +char *string;
>> +};
>>  };
>>
>>  bool expr_constant_parse(struct lexer *,
>>   const struct expr_field *,
>> - union expr_constant *);
>> -void