This change attempts to track the actual available stack and force
recircuation if the flow translation logic is about to overflow it.

Unlike the recursion depth counter, this approach allows to track the
actual stack usage and bail early even if the recursion depth is not
reached yet.  This is important because different actions have vastly
different stack requirements and different systems have different
amount of stack allocated per thread by default.

Should work with both GCC and Clang, will likely not work on Windows.

The main thread is not treated fairly in this version.  At least on
Linux the main thread can grow its stack if the limit is dynamically
increased.  That is not normally true for other threads.  However
this patch sticks to initial stack size even for the main thread.

Sending as an RFC for discussion.  For v1 this change should likely
be split in about 4 patches:

1. Preserve resubmits counter across recirculations.  We might need
   that as a bug fix even, because it might be technically possible to
   bypass the resubmit limit by inducing recirculations in the current
   code, e.g. with conntrack.

2. Introduction of the stack frame functions.

3. Actual recirculation on stack exhaution.

4. Enabe tracing over this kind of recirculations.

Tracing also may need a refactor, it doesn't look particularly clean.
And we may want to track resubmits over packet-ins as well.

The behavioral change with this approach is that if we have a large
pipeline that eventually exceeds the number of resubmits, part of this
pipeline can be executed in the datapath then packet recirculates and
will be dropped afterwards.  In contrast, no actions will be executed
today in such scenario, but OVS would also crash with stack overflow,
so I'm not sure if that counts as a behavioral change. :)

Another one is tracking resubmits over recirculations.  That will
impact, for example, forked pipelines on conntrack, since resubmits
before the fork will be accounted for after the fork.

Didn't test much, only checked an infinite resubmit case:

  make -j8
  (ulimit -s 386; make sandbox)
  ovs-vsctl add-br br0
  ovs-vsctl add-port br0 p1
  ovs-ofclt del-flows br0
  (for i in $(seq 0 64); do
      j=$(expr $i + 1);
      echo "table=$i, actions=local,resubmit(,$j),local,resubmit(,$j),local";
   done;
   echo "table=65, actions=resubmit(,0)") > ./resubmits.txt
  ovs-ofclt add-flows br0 ./resubmits.txt
  ovs-appctl ofproto/trace br0 'in_port=1' > ./trace.txt

Depending on compiler flags it manages to get to 4096 resubmit limit
in anywhere from 5 to 16 recirculations.

Thoughts?

Signed-off-by: Ilya Maximets <[email protected]>
---
 include/openvswitch/compiler.h | 12 +++++++++
 lib/ovs-thread.c               | 47 ++++++++++++++++++++++++++++++++++
 lib/ovs-thread.h               |  8 ++++++
 lib/sat-math.h                 |  5 +---
 ofproto/ofproto-dpif-rid.h     |  1 +
 ofproto/ofproto-dpif-trace.h   |  2 ++
 ofproto/ofproto-dpif-xlate.c   | 32 +++++++++++++++++++++++
 vswitchd/ovs-vswitchd.c        |  1 +
 8 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h
index 878c5c6a7..8da2e6eef 100644
--- a/include/openvswitch/compiler.h
+++ b/include/openvswitch/compiler.h
@@ -26,6 +26,9 @@
 #ifndef __has_extension
   #define __has_extension(x) 0
 #endif
+#ifndef __has_builtin
+  #define __has_builtin(x) 0
+#endif
 
 /* To make OVS_NO_RETURN portable across gcc/clang and MSVC, it should be
  * added at the beginning of the function declaration. */
@@ -300,5 +303,14 @@
 #define BUILD_ASSERT_DECL_GCCONLY(EXPR) ((void) 0)
 #endif
 
+/* OVS_FRAME_ADDRESS can be used to get the address of the current stack frame.
+ * Note: Attempts to get address of any frame beside the current one (0) are
+ * dangerous and can lead to crashes according to GCC documentation.  */
+#if __has_builtin(__builtin_frame_address)
+#define OVS_FRAME_ADDRESS() ((char *) __builtin_frame_address(0))
+#else
+#define OVS_FRAME_ADDRESS() ((char *) 0)
+#endif
+
 
 #endif /* compiler.h */
diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
index f80008061..637ce27ec 100644
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -407,6 +407,52 @@ ovsthread_id_init(void)
     return *ovsthread_id_get() = atomic_count_inc(&next_id);
 }
 
+DEFINE_STATIC_PER_THREAD_DATA(uintptr_t, ovsthread_stack_base, 0);
+DEFINE_STATIC_PER_THREAD_DATA(size_t, ovsthread_stack_size, 0);
+
+void
+ovsthread_stack_init(void)
+{
+    pthread_attr_t attr;
+    size_t stacksize;
+    int error;
+
+    ovs_assert(*ovsthread_stack_base_get() == 0);
+    *ovsthread_stack_base_get() = (uintptr_t) OVS_FRAME_ADDRESS();
+
+    pthread_attr_init(&attr);
+
+    error = pthread_attr_getstacksize(&attr, &stacksize);
+    if (error) {
+        VLOG_ABORT("pthread_attr_getstacksize failed: %s",
+                   ovs_strerror(error));
+    }
+    *ovsthread_stack_size_get() = stacksize;
+    pthread_attr_destroy(&attr);
+}
+
+bool
+ovsthread_low_on_stack(void)
+{
+    uintptr_t curr = (uintptr_t) OVS_FRAME_ADDRESS();
+    uintptr_t base = *ovsthread_stack_base_get();
+    size_t size = *ovsthread_stack_size_get();
+    size_t used;
+
+    if (!curr || !base || !size) {
+        /* Either not initialized or not supported. */
+        return false;
+    }
+
+    used = (base > curr) ? base - curr : curr - base;
+
+    /* Consider 'low' as less than a 100 KB. */
+    if (size < used + 100 * 1024) {
+        return true;
+    }
+    return false;
+}
+
 static void *
 ovsthread_wrapper(void *aux_)
 {
@@ -415,6 +461,7 @@ ovsthread_wrapper(void *aux_)
     unsigned int id;
 
     id = ovsthread_id_init();
+    ovsthread_stack_init();
 
     aux = *auxp;
     free(auxp);
diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
index aac5e19c9..e81f28970 100644
--- a/lib/ovs-thread.h
+++ b/lib/ovs-thread.h
@@ -474,6 +474,14 @@ ovsthread_id_self(void)
     return id;
 }
 
+/* Enables use of ovsthread_low_on_stack().  Must be called by a new thread
+ * early after the thread creation. */
+void ovsthread_stack_init(void);
+
+/* Returns 'true' is the current thread used up most of its stack space.
+ * Can be used, for example, to check if recursion has to be stopped. */
+bool ovsthread_low_on_stack(void);
+
 /* Simulated global counter.
  *
  * Incrementing such a counter is meant to be cheaper than incrementing a
diff --git a/lib/sat-math.h b/lib/sat-math.h
index d66872387..34b939277 100644
--- a/lib/sat-math.h
+++ b/lib/sat-math.h
@@ -18,12 +18,9 @@
 #define SAT_MATH_H 1
 
 #include <limits.h>
+#include "openvswitch/compiler.h"
 #include "openvswitch/util.h"
 
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
-
 /* Returns x + y, clamping out-of-range results into the range of the return
  * type. */
 static inline unsigned int
diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h
index 4df630c62..9488a3850 100644
--- a/ofproto/ofproto-dpif-rid.h
+++ b/ofproto/ofproto-dpif-rid.h
@@ -151,6 +151,7 @@ struct frozen_state {
     struct frozen_metadata metadata; /* Flow metadata. */
     uint8_t *stack;               /* Stack if any. */
     size_t stack_size;
+    size_t resubmits;             /* Number of resubmits prior to freeze. */
     mirror_mask_t mirrors;        /* Mirrors already output. */
     bool conntracked;             /* Conntrack occurred prior to freeze. */
     bool was_mpls;                /* MPLS packet */
diff --git a/ofproto/ofproto-dpif-trace.h b/ofproto/ofproto-dpif-trace.h
index f579a5ca4..1e34cdb95 100644
--- a/ofproto/ofproto-dpif-trace.h
+++ b/ofproto/ofproto-dpif-trace.h
@@ -51,9 +51,11 @@ enum oftrace_node_type {
 
 /* Reason why a flow is in a recirculation queue. */
 enum oftrace_recirc_type {
+    OFT_RECIRC_NONE = 0,
     OFT_RECIRC_CONNTRACK,
     OFT_RECIRC_MPLS,
     OFT_RECIRC_BOND,
+    OFT_RECIRC_STACK_EXHAUSTED,
 };
 
 /* A node within a trace. */
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 89f183182..e5e17b0a6 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -71,6 +71,7 @@
 COVERAGE_DEFINE(xlate_actions);
 COVERAGE_DEFINE(xlate_actions_oversize);
 COVERAGE_DEFINE(xlate_actions_too_many_output);
+COVERAGE_DEFINE(xlate_stack_exhausted);
 
 VLOG_DEFINE_THIS_MODULE(ofproto_dpif_xlate);
 
@@ -381,6 +382,7 @@ struct xlate_ctx {
     * case at that point.
     */
     bool freezing;
+    enum oftrace_recirc_type recirc_type; /* Recirculation type for tracing. */
     bool recirc_update_dp_hash;    /* Generated recirculation will be preceded
                                     * by datapath HASH action to get an updated
                                     * dp_hash after recirculation. */
@@ -495,6 +497,7 @@ ctx_cancel_freeze(struct xlate_ctx *ctx)
 {
     if (ctx->freezing) {
         ctx->freezing = false;
+        ctx->recirc_type = OFT_RECIRC_NONE;
         ctx->recirc_update_dp_hash = false;
         ofpbuf_clear(&ctx->frozen_actions);
         ctx->frozen_actions.header = NULL;
@@ -4607,6 +4610,12 @@ xlate_resubmit_resource_check(struct xlate_ctx *ctx)
     } else if (ctx->stack.size >= 65536) {
         xlate_report_error(ctx, "resubmits yielded over 64 kB of stack");
         ctx->error = XLATE_STACK_TOO_DEEP;
+    } else if (ovsthread_low_on_stack()) {
+        xlate_report(ctx, OFT_WARN, "Thread stack exhausted, "
+                     "recirculating to avoid overflow.");
+        COVERAGE_INC(xlate_stack_exhausted);
+        ctx->recirc_type = OFT_RECIRC_STACK_EXHAUSTED;
+        ctx_trigger_freeze(ctx);
     } else {
         return true;
     }
@@ -5125,6 +5134,7 @@ xlate_controller_action(struct xlate_ctx *ctx, int len,
         .ofproto_uuid = ctx->xbridge->ofproto->uuid,
         .stack = ctx->stack.data,
         .stack_size = ctx->stack.size,
+        .resubmits = ctx->resubmits,
         .mirrors = ctx->mirrors,
         .conntracked = ctx->conntracked,
         .was_mpls = ctx->was_mpls,
@@ -5200,6 +5210,7 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table)
         .ofproto_uuid = ctx->xbridge->ofproto->uuid,
         .stack = ctx->stack.data,
         .stack_size = ctx->stack.size,
+        .resubmits = ctx->resubmits,
         .mirrors = ctx->mirrors,
         .conntracked = ctx->conntracked,
         .was_mpls = ctx->was_mpls,
@@ -5248,6 +5259,24 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table)
             act_hash->hash_basis = ctx->dp_hash_basis;
         }
         nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, recirc_id);
+
+        if (OVS_UNLIKELY(ctx->xin->trace) && recirc_id &&
+            ctx->recirc_type == OFT_RECIRC_STACK_EXHAUSTED) {
+            if (oftrace_add_recirc_node(ctx->xin->recirc_queue,
+                                        ctx->recirc_type,
+                                        &ctx->xin->flow,
+                                        ctx->ct_nat_action,
+                                        ctx->xin->packet,
+                                        recirc_id,
+                                        ctx->xin->flow.ct_zone)) {
+                xlate_report(ctx, OFT_DETAIL,
+                             "Packet is sent for recirculation, "
+                             "recric_id = 0x%"PRIx32".", recirc_id);
+            } else {
+                xlate_report(ctx, OFT_DETAIL, "Failed to trace recirculation "
+                             "with recirc_id = 0x%"PRIx32".", recirc_id);
+            }
+        }
     }
 
     /* Undo changes done by freezing. */
@@ -8062,6 +8091,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out 
*xout)
         .mirrors = 0,
 
         .freezing = false,
+        .recirc_type = OFT_RECIRC_NONE,
         .recirc_update_dp_hash = false,
         .frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub),
         .pause = NULL,
@@ -8139,6 +8169,8 @@ xlate_actions(struct xlate_in *xin, struct xlate_out 
*xout)
             clear_conntrack(&ctx);
         }
 
+        ctx.resubmits = state->resubmits;
+
         /* Restore pipeline metadata. May change flow's in_port and other
          * metadata to the values that existed when freezing was triggered. */
         frozen_metadata_to_flow(&ctx.xbridge->ofproto->up,
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
index 273af9f5d..326e5b9f7 100644
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -93,6 +93,7 @@ main(int argc, char *argv[])
     fatal_ignore_sigpipe();
 
     daemonize_start(true, hw_rawio_access);
+    ovsthread_stack_init();
 
     if (want_mlockall) {
 #ifdef HAVE_MLOCKALL
-- 
2.43.0

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to