Re: [ovs-dev] [PATCH ovn] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds

2022-11-24 Thread Xavier Simonart
Hi Ales, Dumitru

Thanks for the feedback - and I agree that we needed a way to get rid of
the sleep.
Thanks Dumitru for the time/warp trick!.

I was wondering whether we really needed the additional probe for the
connection.
The disconnect is now detected by the controller by rconn_recv within
ovs_feature_get_openflow_cap, detecting that the connection has been closed
("connection closed by peer"). Then the connection enters in a backoff
state, and reconnects after one sec.
But I guess that if ovn-controller is idle (not receiving anything, maybe
waiting for OFPST_METER_FEATURES reply), then we would not detect the
connection disconnect (as no rconn_recv).
Having ovn-controller idle is pretty uncommon (there are other probes such
as for ofctrl), but your proposed change provides a more clear view that we
want to keep this connection open.

So, the change looks good. I'll run a few more tests and I'll' submit a v2

Thanks
Xavier

On Wed, Nov 23, 2022 at 12:04 PM Dumitru Ceara  wrote:

> On 11/23/22 09:10, Ales Musil wrote:
> > On Tue, Nov 22, 2022 at 9:24 PM Xavier Simonart 
> wrote:
> >
> >> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
> >>
> >> Signed-off-by: Xavier Simonart 
> >> ---
> >>  lib/features.c | 16 ++--
> >>  tests/ovn.at   | 18 ++
> >>  2 files changed, 28 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/lib/features.c b/lib/features.c
> >> index f15ec42bb..571e24ded 100644
> >> --- a/lib/features.c
> >> +++ b/lib/features.c
> >> @@ -26,6 +26,7 @@
> >>  #include "openvswitch/rconn.h"
> >>  #include "openvswitch/ofp-msgs.h"
> >>  #include "openvswitch/ofp-meter.h"
> >> +#include "openvswitch/ofp-util.h"
> >>  #include "ovn/features.h"
> >>
> >>  VLOG_DEFINE_THIS_MODULE(features);
> >> @@ -90,6 +91,8 @@ ovs_feature_rconn_setup(const char *br_name)
> >>  static bool
> >>  ovs_feature_get_openflow_cap(const char *br_name)
> >>  {
> >> +struct ofpbuf *msg;
> >> +
> >>  if (!br_name) {
> >>  return false;
> >>  }
> >> @@ -102,15 +105,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
> >>  }
> >>
> >>  /* send new requests just after reconnect. */
> >> -if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
> >> -return false;
> >> +if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
> >> +/* dump datapath meter capabilities. */
> >> +msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> >> +   rconn_get_version(swconn), 0);
> >> +rconn_send(swconn, msg, NULL);
> >>  }
> >>
> >>  bool ret = false;
> >> -/* dump datapath meter capabilities. */
> >> -struct ofpbuf *msg =
> >> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> >> -  rconn_get_version(swconn), 0);
> >> -rconn_send(swconn, msg, NULL);
> >>  for (int i = 0; i < 50; i++) {
> >>  msg = rconn_recv(swconn);
> >>  if (!msg) {
> >> @@ -137,6 +139,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
> >>  }
> >>  }
> >>  conn_seq_no = rconn_get_connection_seqno(swconn);
> >> +} else if (type == OFPTYPE_ECHO_REQUEST) {
> >> +rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
> >>  }
> >>  ofpbuf_delete(msg);
> >>  }
> >> diff --git a/tests/ovn.at b/tests/ovn.at
> >> index cf1ea991d..3a1c231f5 100644
> >> --- a/tests/ovn.at
> >> +++ b/tests/ovn.at
> >> @@ -33069,3 +33069,21 @@ check ovn-nbctl --wait=hv sync
> >>  OVN_CLEANUP([hv1])
> >>  AT_CLEANUP
> >>  ])
> >> +
> >> +OVN_FOR_EACH_NORTHD([
> >> +AT_SETUP([feature inactivity probe])
> >> +ovn_start
> >> +net_add n1
> >> +
> >> +sim_add hv1
> >> +as hv1
> >> +check ovs-vsctl add-br br-phys
> >> +ovn_attach n1 br-phys 192.168.0.1
> >> +
> >> +# Wait for more than 2x 60 seconds
> >> +sleep 125
> >> +
> >> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
> >> +OVN_CLEANUP([hv1])
> >> +AT_CLEANUP
> >> +])
> >> --
> >> 2.31.1
> >>
> >> ___
> >> dev mailing list
> >> d...@openvswitch.org
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>
> >>
> > Hi Xavier,
> >
>
> Hi Xavier, Ales,
>
> > the change itself looks good, however I'm very concerned about the test.
> > I wonder if we can roll with confirmation that this just works, without
> > actually having
> > test that sleeps for more than 2 minutes.
> >
> > Dumitru, Mark, Han any thoughts?
> >
>
> I agree, this is too long.  I played a bit with it and this is what I
> came up with:
>
> diff --git a/tests/ovn.at b/tests/ovn.at
> index fc5cf257ef..6d21446ddd 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -33253,8 +33253,21 @@ as hv1
>  check ovs-vsctl add-br br-phys
>  ovn_attach n1 br-phys 192.168.0.1
>
> -# Wait for more than 2x 60 seconds
> -sleep 125
> +dnl Ensure that there are at least 3 openflow connections.
> +OVS_WAIT_UNTIL([test "$(grep -c 

Re: [ovs-dev] [PATCH ovn] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds

2022-11-23 Thread Dumitru Ceara
On 11/23/22 09:10, Ales Musil wrote:
> On Tue, Nov 22, 2022 at 9:24 PM Xavier Simonart  wrote:
> 
>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
>>
>> Signed-off-by: Xavier Simonart 
>> ---
>>  lib/features.c | 16 ++--
>>  tests/ovn.at   | 18 ++
>>  2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/features.c b/lib/features.c
>> index f15ec42bb..571e24ded 100644
>> --- a/lib/features.c
>> +++ b/lib/features.c
>> @@ -26,6 +26,7 @@
>>  #include "openvswitch/rconn.h"
>>  #include "openvswitch/ofp-msgs.h"
>>  #include "openvswitch/ofp-meter.h"
>> +#include "openvswitch/ofp-util.h"
>>  #include "ovn/features.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(features);
>> @@ -90,6 +91,8 @@ ovs_feature_rconn_setup(const char *br_name)
>>  static bool
>>  ovs_feature_get_openflow_cap(const char *br_name)
>>  {
>> +struct ofpbuf *msg;
>> +
>>  if (!br_name) {
>>  return false;
>>  }
>> @@ -102,15 +105,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>  }
>>
>>  /* send new requests just after reconnect. */
>> -if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
>> -return false;
>> +if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
>> +/* dump datapath meter capabilities. */
>> +msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> +   rconn_get_version(swconn), 0);
>> +rconn_send(swconn, msg, NULL);
>>  }
>>
>>  bool ret = false;
>> -/* dump datapath meter capabilities. */
>> -struct ofpbuf *msg =
>> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> -  rconn_get_version(swconn), 0);
>> -rconn_send(swconn, msg, NULL);
>>  for (int i = 0; i < 50; i++) {
>>  msg = rconn_recv(swconn);
>>  if (!msg) {
>> @@ -137,6 +139,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>  }
>>  }
>>  conn_seq_no = rconn_get_connection_seqno(swconn);
>> +} else if (type == OFPTYPE_ECHO_REQUEST) {
>> +rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
>>  }
>>  ofpbuf_delete(msg);
>>  }
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index cf1ea991d..3a1c231f5 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -33069,3 +33069,21 @@ check ovn-nbctl --wait=hv sync
>>  OVN_CLEANUP([hv1])
>>  AT_CLEANUP
>>  ])
>> +
>> +OVN_FOR_EACH_NORTHD([
>> +AT_SETUP([feature inactivity probe])
>> +ovn_start
>> +net_add n1
>> +
>> +sim_add hv1
>> +as hv1
>> +check ovs-vsctl add-br br-phys
>> +ovn_attach n1 br-phys 192.168.0.1
>> +
>> +# Wait for more than 2x 60 seconds
>> +sleep 125
>> +
>> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
>> +OVN_CLEANUP([hv1])
>> +AT_CLEANUP
>> +])
>> --
>> 2.31.1
>>
>> ___
>> dev mailing list
>> d...@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
> Hi Xavier,
> 

Hi Xavier, Ales,

> the change itself looks good, however I'm very concerned about the test.
> I wonder if we can roll with confirmation that this just works, without
> actually having
> test that sleeps for more than 2 minutes.
> 
> Dumitru, Mark, Han any thoughts?
> 

I agree, this is too long.  I played a bit with it and this is what I
came up with:

diff --git a/tests/ovn.at b/tests/ovn.at
index fc5cf257ef..6d21446ddd 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -33253,8 +33253,21 @@ as hv1
 check ovs-vsctl add-br br-phys
 ovn_attach n1 br-phys 192.168.0.1
 
-# Wait for more than 2x 60 seconds
-sleep 125
+dnl Ensure that there are at least 3 openflow connections.
+OVS_WAIT_UNTIL([test "$(grep -c 'negotiated OpenFlow version' 
hv1/ovs-vswitchd.log)" -eq "3"])
+
+dnl "Wait" 3 times 60 seconds and ensure ovn-controller writes to the
+dnl openflow connections in the meantime.  This should allow ovs-vswitchd
+dnl to probe the openflow connections at least twice.
+
+as hv1 ovs-appctl time/warp 6
+check ovn-nbctl --wait=hv sync
+
+as hv1 ovs-appctl time/warp 6
+check ovn-nbctl --wait=hv sync
+
+as hv1 ovs-appctl time/warp 6
+check ovn-nbctl --wait=hv sync
 
 AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
 OVN_CLEANUP([hv1])
---

It's still not 100% bullet proof but I think it should be ok.

However, this brings me to the next issue: if the features
openflow connection goes down we don't reconnect from
ovn-controller.

I think that's because on the vswitchd side, closing the punix
stream doesn't do anything.  And on the ovn-controller side we
have no chance of detecting the connection drop either because
probes are disabled and because we only request meter features
after a reconnect.

So I made this change to explicit request probes on the connection
(like we do for ofctrl):

diff --git a/lib/features.c b/lib/features.c
index a1e2e6dc1c..898a0cade5 100644
--- a/lib/features.c
+++ b/lib/features.c

Re: [ovs-dev] [PATCH ovn] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds

2022-11-23 Thread Ales Musil
On Tue, Nov 22, 2022 at 9:24 PM Xavier Simonart  wrote:

> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
>
> Signed-off-by: Xavier Simonart 
> ---
>  lib/features.c | 16 ++--
>  tests/ovn.at   | 18 ++
>  2 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/lib/features.c b/lib/features.c
> index f15ec42bb..571e24ded 100644
> --- a/lib/features.c
> +++ b/lib/features.c
> @@ -26,6 +26,7 @@
>  #include "openvswitch/rconn.h"
>  #include "openvswitch/ofp-msgs.h"
>  #include "openvswitch/ofp-meter.h"
> +#include "openvswitch/ofp-util.h"
>  #include "ovn/features.h"
>
>  VLOG_DEFINE_THIS_MODULE(features);
> @@ -90,6 +91,8 @@ ovs_feature_rconn_setup(const char *br_name)
>  static bool
>  ovs_feature_get_openflow_cap(const char *br_name)
>  {
> +struct ofpbuf *msg;
> +
>  if (!br_name) {
>  return false;
>  }
> @@ -102,15 +105,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
>  }
>
>  /* send new requests just after reconnect. */
> -if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
> -return false;
> +if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
> +/* dump datapath meter capabilities. */
> +msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> +   rconn_get_version(swconn), 0);
> +rconn_send(swconn, msg, NULL);
>  }
>
>  bool ret = false;
> -/* dump datapath meter capabilities. */
> -struct ofpbuf *msg =
> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> -  rconn_get_version(swconn), 0);
> -rconn_send(swconn, msg, NULL);
>  for (int i = 0; i < 50; i++) {
>  msg = rconn_recv(swconn);
>  if (!msg) {
> @@ -137,6 +139,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
>  }
>  }
>  conn_seq_no = rconn_get_connection_seqno(swconn);
> +} else if (type == OFPTYPE_ECHO_REQUEST) {
> +rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
>  }
>  ofpbuf_delete(msg);
>  }
> diff --git a/tests/ovn.at b/tests/ovn.at
> index cf1ea991d..3a1c231f5 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -33069,3 +33069,21 @@ check ovn-nbctl --wait=hv sync
>  OVN_CLEANUP([hv1])
>  AT_CLEANUP
>  ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([feature inactivity probe])
> +ovn_start
> +net_add n1
> +
> +sim_add hv1
> +as hv1
> +check ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +# Wait for more than 2x 60 seconds
> +sleep 125
> +
> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
> +OVN_CLEANUP([hv1])
> +AT_CLEANUP
> +])
> --
> 2.31.1
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Hi Xavier,

the change itself looks good, however I'm very concerned about the test.
I wonder if we can roll with confirmation that this just works, without
actually having
test that sleeps for more than 2 minutes.

Dumitru, Mark, Han any thoughts?

-- 

Ales Musil

Senior Software Engineer - OVN Core

Red Hat EMEA 

amu...@redhat.comIM: amusil

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds

2022-11-22 Thread Xavier Simonart
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084

Signed-off-by: Xavier Simonart 
---
 lib/features.c | 16 ++--
 tests/ovn.at   | 18 ++
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/lib/features.c b/lib/features.c
index f15ec42bb..571e24ded 100644
--- a/lib/features.c
+++ b/lib/features.c
@@ -26,6 +26,7 @@
 #include "openvswitch/rconn.h"
 #include "openvswitch/ofp-msgs.h"
 #include "openvswitch/ofp-meter.h"
+#include "openvswitch/ofp-util.h"
 #include "ovn/features.h"
 
 VLOG_DEFINE_THIS_MODULE(features);
@@ -90,6 +91,8 @@ ovs_feature_rconn_setup(const char *br_name)
 static bool
 ovs_feature_get_openflow_cap(const char *br_name)
 {
+struct ofpbuf *msg;
+
 if (!br_name) {
 return false;
 }
@@ -102,15 +105,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
 }
 
 /* send new requests just after reconnect. */
-if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
-return false;
+if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
+/* dump datapath meter capabilities. */
+msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
+   rconn_get_version(swconn), 0);
+rconn_send(swconn, msg, NULL);
 }
 
 bool ret = false;
-/* dump datapath meter capabilities. */
-struct ofpbuf *msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
-  rconn_get_version(swconn), 0);
-rconn_send(swconn, msg, NULL);
 for (int i = 0; i < 50; i++) {
 msg = rconn_recv(swconn);
 if (!msg) {
@@ -137,6 +139,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
 }
 }
 conn_seq_no = rconn_get_connection_seqno(swconn);
+} else if (type == OFPTYPE_ECHO_REQUEST) {
+rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
 }
 ofpbuf_delete(msg);
 }
diff --git a/tests/ovn.at b/tests/ovn.at
index cf1ea991d..3a1c231f5 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -33069,3 +33069,21 @@ check ovn-nbctl --wait=hv sync
 OVN_CLEANUP([hv1])
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([feature inactivity probe])
+ovn_start
+net_add n1
+
+sim_add hv1
+as hv1
+check ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+# Wait for more than 2x 60 seconds
+sleep 125
+
+AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
+OVN_CLEANUP([hv1])
+AT_CLEANUP
+])
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev