Re: Wifi bug

2021-09-27 Thread e9hack

Am 27.09.2021 um 19:02 schrieb Felix Fietkau:


Fix pushed, thanks for testing.

- Felix



It fixes my issue too.

In the middle of bonding_enable_port() and bridge_enable_member() there is a 
'return -1'. All error cases before and after it use a 'goto ...' to revert 
some things. Is the 'return -1' correct or a bug?

Regards,
Hartmut

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread Felix Fietkau


On 2021-09-27 18:30, Hannu Nyman wrote:
> Felix Fietkau kirjoitti 27.9.2021 klo 19.17:
>> On 2021-09-27 17:45, Hannu Nyman wrote:
>>> Felix Fietkau kirjoitti 27.9.2021 klo 13.59:
 On a crash, it should drop a .core file to /tmp. Please copy that to
 your build host and use ./scripts/remote-gdb to obtain a backtrace from
 it. I'd like to know, which line of code in netifd it crashes on, so I
 can fix it. So far the bug has not shown up in my own tests...

 - Felix
>>>
>>> This is probably what you are looking for...
>>> To me it looks like it might actually be a list handling bug in libubox.
> 
>> Can you please try this netifd patch?
> 
> At first glance, the impact looks ok to me:
>   wifi goes down with "wifi down", netifd stays alive and wifi remains down. 
>   :-)
> 
> Hannu

Fix pushed, thanks for testing.

- Felix

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread Hannu Nyman

Felix Fietkau kirjoitti 27.9.2021 klo 19.17:

On 2021-09-27 17:45, Hannu Nyman wrote:

Felix Fietkau kirjoitti 27.9.2021 klo 13.59:

On a crash, it should drop a .core file to /tmp. Please copy that to
your build host and use ./scripts/remote-gdb to obtain a backtrace from
it. I'd like to know, which line of code in netifd it crashes on, so I
can fix it. So far the bug has not shown up in my own tests...

- Felix


This is probably what you are looking for...
To me it looks like it might actually be a list handling bug in libubox.



Can you please try this netifd patch?


At first glance, the impact looks ok to me:
  wifi goes down with "wifi down", netifd stays alive and wifi remains down.  
:-)

Hannu



Thanks,

- Felix

---
diff --git a/alias.c b/alias.c
index 951e046bb3f1..98d54100fef9 100644
--- a/alias.c
+++ b/alias.c
@@ -178,13 +178,9 @@ alias_notify_device(const char *name, struct device *dev)
  {
struct alias_device *alias;
  
-	device_lock();

-
alias = avl_find_element(&aliases, name, alias, avl);
if (alias)
alias_set_device(alias, dev);
-
-   device_unlock();
  }
  
  struct device *

diff --git a/bonding.c b/bonding.c
index 0bf4f9a331ef..457fe5159899 100644
--- a/bonding.c
+++ b/bonding.c
@@ -566,8 +566,6 @@ bonding_free_port(struct bonding_port *bp)
  
  	bonding_remove_port(bp);
  
-	device_lock();

-
device_remove_user(&bp->dev);
  
  	/*

@@ -582,8 +580,6 @@ bonding_free_port(struct bonding_port *bp)
device_set_present(dev, true);
}
  
-	device_unlock();

-
free(bp);
  }
  
diff --git a/bridge.c b/bridge.c

index 2ce5c2b11b49..7e61b9df8326 100644
--- a/bridge.c
+++ b/bridge.c
@@ -512,8 +512,6 @@ restart:
goto restart;
}
  
-	device_lock();

-
device_remove_user(&bm->dev);
uloop_timeout_cancel(&bm->check_timer);
  
@@ -529,8 +527,6 @@ restart:

device_set_present(dev, true);
}
  
-	device_unlock();

-
free(bm);
  }
  
diff --git a/config.c b/config.c

index d83ea9cb6b6c..9bbda39d3fb5 100644
--- a/config.c
+++ b/config.c
@@ -762,7 +762,6 @@ config_init_all(void)
  
  	vlist_update();

config_init = true;
-   device_lock();
  
  	device_reset_config();

config_init_devices(true);
@@ -775,12 +774,10 @@ config_init_all(void)
config_init_wireless();
  
  	config_init = false;

-   device_unlock();
  
  	device_reset_old();

device_init_pending();
vlist_flush();
-   device_free_unused(NULL);
interface_refresh_assignments(false);
interface_start_pending();
wireless_start_pending();
diff --git a/device.c b/device.c
index bb39ea7f8d71..b3d0e85f8550 100644
--- a/device.c
+++ b/device.c
@@ -99,18 +99,6 @@ device_type_get(const char *tname)
return NULL;
  }
  
-void device_lock(void)

-{
-   __devlock++;
-}
-
-void device_unlock(void)
-{
-   __devlock--;
-   if (!__devlock)
-   device_free_unused(NULL);
-}
-
  static int device_vlan_len(struct kvlist *kv, const void *data)
  {
return sizeof(unsigned int);
@@ -895,14 +883,27 @@ device_free(struct device *dev)
  }
  
  static void

-__device_free_unused(struct device *dev)
+__device_free_unused(struct uloop_timeout *timeout)
  {
-   if (!safe_list_empty(&dev->users) ||
-   !safe_list_empty(&dev->aliases) ||
-   dev->current_config || __devlock)
-   return;
+   struct device *dev, *tmp;
+
+   avl_for_each_element_safe(&devices, dev, avl, tmp) {
+   if (!safe_list_empty(&dev->users) ||
+   !safe_list_empty(&dev->aliases) ||
+   dev->current_config)
+   continue;
+
+   device_free(dev);
+   }
+}
+
+void device_free_unused(void)
+{
+   static struct uloop_timeout free_timer = {
+   .cb = __device_free_unused,
+   };
  
-	device_free(dev);

+   uloop_timeout_set(&free_timer, 1);
  }
  
  void device_remove_user(struct device_user *dep)

@@ -919,19 +920,7 @@ void device_remove_user(struct device_user *dep)
safe_list_del(&dep->list);
dep->dev = NULL;
D(DEVICE, "Remove user for device '%s', refcount=%d\n", dev->ifname, 
device_refcount(dev));
-   __device_free_unused(dev);
-}
-
-void
-device_free_unused(struct device *dev)
-{
-   struct device *tmp;
-
-   if (dev)
-   return __device_free_unused(dev);
-
-   avl_for_each_element_safe(&devices, dev, avl, tmp)
-   __device_free_unused(dev);
+   device_free_unused();
  }
  
  void

diff --git a/device.h b/device.h
index 0496e893cbc9..37f8c37c58a3 100644
--- a/device.h
+++ b/device.h
@@ -300,9 +300,6 @@ extern const struct uci_blob_param_list device_attr_list;
  extern struct device_type simple_device_type;
  extern struct device_type tunnel_device_type;
  
-void device_lock(void);

-void device_unlock(void);
-
  void device_vlan_update(bool done);
  void device_stp_init(void);
  
@@ 

Re: Wifi bug

2021-09-27 Thread Felix Fietkau
On 2021-09-27 17:45, Hannu Nyman wrote:
> Felix Fietkau kirjoitti 27.9.2021 klo 13.59:
>> On a crash, it should drop a .core file to /tmp. Please copy that to
>> your build host and use ./scripts/remote-gdb to obtain a backtrace from
>> it. I'd like to know, which line of code in netifd it crashes on, so I
>> can fix it. So far the bug has not shown up in my own tests...
>>
>> - Felix
> 
> 
> This is probably what you are looking for...
> To me it looks like it might actually be a list handling bug in libubox.

Can you please try this netifd patch?

Thanks,

- Felix

---
diff --git a/alias.c b/alias.c
index 951e046bb3f1..98d54100fef9 100644
--- a/alias.c
+++ b/alias.c
@@ -178,13 +178,9 @@ alias_notify_device(const char *name, struct device *dev)
 {
struct alias_device *alias;
 
-   device_lock();
-
alias = avl_find_element(&aliases, name, alias, avl);
if (alias)
alias_set_device(alias, dev);
-
-   device_unlock();
 }
 
 struct device *
diff --git a/bonding.c b/bonding.c
index 0bf4f9a331ef..457fe5159899 100644
--- a/bonding.c
+++ b/bonding.c
@@ -566,8 +566,6 @@ bonding_free_port(struct bonding_port *bp)
 
bonding_remove_port(bp);
 
-   device_lock();
-
device_remove_user(&bp->dev);
 
/*
@@ -582,8 +580,6 @@ bonding_free_port(struct bonding_port *bp)
device_set_present(dev, true);
}
 
-   device_unlock();
-
free(bp);
 }
 
diff --git a/bridge.c b/bridge.c
index 2ce5c2b11b49..7e61b9df8326 100644
--- a/bridge.c
+++ b/bridge.c
@@ -512,8 +512,6 @@ restart:
goto restart;
}
 
-   device_lock();
-
device_remove_user(&bm->dev);
uloop_timeout_cancel(&bm->check_timer);
 
@@ -529,8 +527,6 @@ restart:
device_set_present(dev, true);
}
 
-   device_unlock();
-
free(bm);
 }
 
diff --git a/config.c b/config.c
index d83ea9cb6b6c..9bbda39d3fb5 100644
--- a/config.c
+++ b/config.c
@@ -762,7 +762,6 @@ config_init_all(void)
 
vlist_update();
config_init = true;
-   device_lock();
 
device_reset_config();
config_init_devices(true);
@@ -775,12 +774,10 @@ config_init_all(void)
config_init_wireless();
 
config_init = false;
-   device_unlock();
 
device_reset_old();
device_init_pending();
vlist_flush();
-   device_free_unused(NULL);
interface_refresh_assignments(false);
interface_start_pending();
wireless_start_pending();
diff --git a/device.c b/device.c
index bb39ea7f8d71..b3d0e85f8550 100644
--- a/device.c
+++ b/device.c
@@ -99,18 +99,6 @@ device_type_get(const char *tname)
return NULL;
 }
 
-void device_lock(void)
-{
-   __devlock++;
-}
-
-void device_unlock(void)
-{
-   __devlock--;
-   if (!__devlock)
-   device_free_unused(NULL);
-}
-
 static int device_vlan_len(struct kvlist *kv, const void *data)
 {
return sizeof(unsigned int);
@@ -895,14 +883,27 @@ device_free(struct device *dev)
 }
 
 static void
-__device_free_unused(struct device *dev)
+__device_free_unused(struct uloop_timeout *timeout)
 {
-   if (!safe_list_empty(&dev->users) ||
-   !safe_list_empty(&dev->aliases) ||
-   dev->current_config || __devlock)
-   return;
+   struct device *dev, *tmp;
+
+   avl_for_each_element_safe(&devices, dev, avl, tmp) {
+   if (!safe_list_empty(&dev->users) ||
+   !safe_list_empty(&dev->aliases) ||
+   dev->current_config)
+   continue;
+
+   device_free(dev);
+   }
+}
+
+void device_free_unused(void)
+{
+   static struct uloop_timeout free_timer = {
+   .cb = __device_free_unused,
+   };
 
-   device_free(dev);
+   uloop_timeout_set(&free_timer, 1);
 }
 
 void device_remove_user(struct device_user *dep)
@@ -919,19 +920,7 @@ void device_remove_user(struct device_user *dep)
safe_list_del(&dep->list);
dep->dev = NULL;
D(DEVICE, "Remove user for device '%s', refcount=%d\n", dev->ifname, 
device_refcount(dev));
-   __device_free_unused(dev);
-}
-
-void
-device_free_unused(struct device *dev)
-{
-   struct device *tmp;
-
-   if (dev)
-   return __device_free_unused(dev);
-
-   avl_for_each_element_safe(&devices, dev, avl, tmp)
-   __device_free_unused(dev);
+   device_free_unused();
 }
 
 void
diff --git a/device.h b/device.h
index 0496e893cbc9..37f8c37c58a3 100644
--- a/device.h
+++ b/device.h
@@ -300,9 +300,6 @@ extern const struct uci_blob_param_list device_attr_list;
 extern struct device_type simple_device_type;
 extern struct device_type tunnel_device_type;
 
-void device_lock(void);
-void device_unlock(void);
-
 void device_vlan_update(bool done);
 void device_stp_init(void);
 
@@ -346,7 +343,7 @@ void device_release(struct device_user *dep);
 int device_check_state(struct device *dev);
 void device_dump_status(struct blob_buf *b, struct 

Re: Wifi bug

2021-09-27 Thread Hannu Nyman

Felix Fietkau kirjoitti 27.9.2021 klo 13.59:

On a crash, it should drop a .core file to /tmp. Please copy that to
your build host and use ./scripts/remote-gdb to obtain a backtrace from
it. I'd like to know, which line of code in netifd it crashes on, so I
can fix it. So far the bug has not shown up in my own tests...

- Felix



This is probably what you are looking for...
To me it looks like it might actually be a list handling bug in libubox.


perus@ub2104:/Openwrt/r7800$ 
./build_dir/toolchain-arm_cortex-a15+neon-vfpv4_gcc-11.2.0_musl_eabi/gdb-10.1/gdb/gdb 
./staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/root-ipq806x/sbin/netifd 
netifd.1632756907.1577.11.core

GNU gdb (GDB) 10.1
Copyright (C) 2020 Free Software Foundation, Inc.
...
Type "apropos word" to search for commands related to "word"...
Reading symbols from 
./staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/root-ipq806x/sbin/netifd...


warning: Can't open file /sbin/netifd during file-backed mapping note processing
...
warning: Can't open file /lib/libc.so during file-backed mapping note processing
[New LWP 1577]

warning: Could not load shared library symbols for 8 libraries, e.g. 
/lib/libubox.so.20210819.

Use the "info sharedlibrary" command to see the complete listing.
Do you need "set solib-search-path" or "set sysroot"?
Core was generated by `/sbin/netifd'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0xb6ea058c in ?? ()
(gdb) bt
#0  0xb6ea058c in ?? ()
#1  0x0001edc0 in device_broadcast_cb (ctx=<optimized out>, list=<optimized out>)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/netifd-2021-09-21-08e954e1/device.c:497

#2  0xb6f2f228 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb) set solib-search-path 
./staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/

host/ packages/ pkginfo/  root-ipq806x/ stamp/    usr/
(gdb) set solib-search-path 
./staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libubox.so.20210819...
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libubus.so.20210630...
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libuci.so...
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libjson-c.so.5.1.0...
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libblobmsg_json.so.20210819...
Reading symbols from 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/lib/libnl-tiny.so...

(gdb) bt
#0  0xb6ea058c in __safe_list_del_iterator (i=0xbee1ed0c)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/libubox-2021-08-19-d716ac4b/safe_list.c:49

#1  __safe_list_move_iterator (i=0xbee1ed0c, list=0xb6f2f220)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/libubox-2021-08-19-d716ac4b/safe_list.c:58
#2  safe_list_for_each (head=head@entry=0xb6f2f220, cb=cb@entry=0x1ed94 
<device_broadcast_cb>, ctx=0xbee1ed3c,

    ctx@entry=0xbee1ed34)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/libubox-2021-08-19-d716ac4b/safe_list.c:73

#3  0x000210d0 in device_broadcast_event (dev=0xb6f2f200, ev=)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/netifd-2021-09-21-08e954e1/device.c:506

#4  0x000289a4 in handle_hotplug_msg (size=, data=)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/netifd-2021-09-21-08e954e1/system-linux.c:777

#5  handle_hotplug_event (u=0x46ea4 , events=)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/netifd-2021-09-21-08e954e1/system-linux.c:793

#6  0xb6e9e9f4 in uloop_run_events (timeout=)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/libubox-2021-08-19-d716ac4b/uloop.c:198

#7  uloop_run_timeout (timeout=timeout@entry=-1)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/libubox-2021-08-19-d716ac4b/uloop.c:555

#8  0x00013ae8 in uloop_run ()
    at 
/Openwrt/r7800/staging_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/usr/include/libubox/uloop.h:111

#9  main (argc=1, argv=)
    at 
/Openwrt/r7800/build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/netifd-2021-09-21-08e954e1/main.c:339

(gdb)



There is also a few seconds' gap in the system log, when netifd starts again:


Mon Sep 27 18:35:06 2021 kern.warn kernel: [  178.272585] ath10k_pci 
:01:00.0: peer-unmap-event: unknown peer id 0
Mon Sep 27 18:35:06 2021 kern.warn kernel: [  178.272647] ath10k_pci 
:01:00.0: peer-unmap-event: unknown peer id 0
Mon Sep 27 18:35:07 2021 daemon.notice netifd: Network device 'wlan0' link is 
down

Mon Sep 27 18:35:12 2021 user.notice : Added device handler type: bonding
Mon Sep 27 18:35:12 2021 

Re: Wifi bug

2021-09-27 Thread Hannu Nyman

e9hack kirjoitti 27.9.2021 klo 16.39:

Am 27.09.2021 um 14:01 schrieb Felix Fietkau:

Normally it should be active by default. Is CONFIG_KERNEL_ELF_CORE set
in your .config?


It was not activated, but the output from gdb looks not so helpful:

hb@vbox-linux6:~/src/openwrt/LEDE/archer-C7-ath79-5.10.x-dsa/build_dir/target-mips_74kc_musl/netifd-2021-09-21-08e954e1> 
../../../scripts/remote-gdb netifd.1632747166.9335.11.core netifd

Choose target:
  1) mips_74kc (musl )
GNU gdb (GDB) 10.1
...
Reading symbols from netifd...
[New LWP 9335]
Core was generated by `/sbin/netifd'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x77d79ed5 in ?? (warning: GDB can't find the start of the function at 
0x77d79ed5.


    GDB is unable to find the start of the function at 0x77d79ed5
and thus can't determine the size of that function's stack frame. 



Did you remember to provide it with a unstripped "netifd" binary from the 
staging_dir ?
(or did you try with the stripped final binary in build_dir or router? which 
will fail unless you have disabled stripping and have the full binary intact)




___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread e9hack

Am 27.09.2021 um 14:01 schrieb Felix Fietkau:

Normally it should be active by default. Is CONFIG_KERNEL_ELF_CORE set
in your .config?


It was not activated, but the output from gdb looks not so helpful:

hb@vbox-linux6:~/src/openwrt/LEDE/archer-C7-ath79-5.10.x-dsa/build_dir/target-mips_74kc_musl/netifd-2021-09-21-08e954e1>
 ../../../scripts/remote-gdb netifd.1632747166.9335.11.core netifd
Choose target:
  1) mips_74kc (musl )
GNU gdb (GDB) 10.1
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "--host=x86_64-pc-linux-gnu 
--target=mips-openwrt-linux-musl".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from netifd...
[New LWP 9335]
Core was generated by `/sbin/netifd'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x77d79ed5 in ?? (warning: GDB can't find the start of the function at 
0x77d79ed5.

GDB is unable to find the start of the function at 0x77d79ed5
and thus can't determine the size of that function's stack frame.
This means that GDB may be unable to access that stack frame, or
the frames below it.
This problem is most likely caused by an invalid program counter or
stack pointer.
However, if you think GDB should simply search farther back
from 0x77d79ed5 for code which looks like the beginning of a
function, you can increase the range of the search using the `set
heuristic-fence-post' command.
)
(gdb)

Regards,
Hartmut

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread Felix Fietkau
On 2021-09-27 13:33, e9hack wrote:
> Am 27.09.2021 um 12:59 schrieb Felix Fietkau:
>> 
>> Hi,
>> 
>> 
>> On 2021-09-26 14:48, e9hack wrote:
>>> Do you see a page fault from netifd in the log? If it does crash, it is 
>>> restarted by procd. This does restart the network stack. If I start netifd 
>>> with strace, I got this lines immediately before the page fault:
>>>
>>> unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = -1 ENOENT (No such file 
>>> or directory)
>>> open("/tmp/resolv.conf.d/resolv.conf.auto.tmp", 
>>> O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 14
>>> ioctl(14, TIOCGWINSZ, 0x7f717a74)   = -1 ENOTTY (Not a tty)
>>> _llseek(14, 0, [0], SEEK_SET)   = 0
>>> readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
>>> close(14)   = 0
>>> open("/tmp/resolv.conf.d/resolv.conf.auto", O_RDONLY|O_LARGEFILE) = 14
>>> readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
>>> close(14)   = 0
>>> unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = 0
>>> munmap(0x77d61000, 8192)= 0
>>> --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x77d61038} ---
>>> +++ killed by SIGSEGV +++
>>>
>>> The part 0x77d61... does change on several starts. The offset 0x.038 is 
>>> the same.
>> On a crash, it should drop a .core file to /tmp. Please copy that to
>> your build host and use ./scripts/remote-gdb to obtain a backtrace from
>> it. I'd like to know, which line of code in netifd it crashes on, so I
>> can fix it. So far the bug has not shown up in my own tests...
>> 
>> - Felix
> 
> It doesn't generate a core-dump. How can I activate core-dumps?
Normally it should be active by default. Is CONFIG_KERNEL_ELF_CORE set
in your .config?

- Felix

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread e9hack

Am 27.09.2021 um 12:59 schrieb Felix Fietkau:


Hi,


On 2021-09-26 14:48, e9hack wrote:

Do you see a page fault from netifd in the log? If it does crash, it is 
restarted by procd. This does restart the network stack. If I start netifd with 
strace, I got this lines immediately before the page fault:

unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = -1 ENOENT (No such file or 
directory)
open("/tmp/resolv.conf.d/resolv.conf.auto.tmp", 
O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 14
ioctl(14, TIOCGWINSZ, 0x7f717a74)   = -1 ENOTTY (Not a tty)
_llseek(14, 0, [0], SEEK_SET)   = 0
readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
close(14)   = 0
open("/tmp/resolv.conf.d/resolv.conf.auto", O_RDONLY|O_LARGEFILE) = 14
readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
close(14)   = 0
unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = 0
munmap(0x77d61000, 8192)= 0
--- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x77d61038} ---
+++ killed by SIGSEGV +++

The part 0x77d61... does change on several starts. The offset 0x.038 is the 
same.

On a crash, it should drop a .core file to /tmp. Please copy that to
your build host and use ./scripts/remote-gdb to obtain a backtrace from
it. I'd like to know, which line of code in netifd it crashes on, so I
can fix it. So far the bug has not shown up in my own tests...

- Felix


It doesn't generate a core-dump. How can I activate core-dumps?

Regards,
Hartmut


___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-27 Thread Felix Fietkau


Hi,


On 2021-09-26 14:48, e9hack wrote:
> Do you see a page fault from netifd in the log? If it does crash, it is 
> restarted by procd. This does restart the network stack. If I start netifd 
> with strace, I got this lines immediately before the page fault:
> 
> unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = -1 ENOENT (No such file 
> or directory)
> open("/tmp/resolv.conf.d/resolv.conf.auto.tmp", 
> O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 14
> ioctl(14, TIOCGWINSZ, 0x7f717a74)   = -1 ENOTTY (Not a tty)
> _llseek(14, 0, [0], SEEK_SET)   = 0
> readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
> close(14)   = 0
> open("/tmp/resolv.conf.d/resolv.conf.auto", O_RDONLY|O_LARGEFILE) = 14
> readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
> close(14)   = 0
> unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = 0
> munmap(0x77d61000, 8192)= 0
> --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x77d61038} ---
> +++ killed by SIGSEGV +++
> 
> The part 0x77d61... does change on several starts. The offset 0x.038 is 
> the same.
On a crash, it should drop a .core file to /tmp. Please copy that to
your build host and use ./scripts/remote-gdb to obtain a backtrace from
it. I'd like to know, which line of code in netifd it crashes on, so I
can fix it. So far the bug has not shown up in my own tests...

- Felix

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-26 Thread Henrique de Moraes Holschuh

On 26/09/2021 06:28, Henrique de Moraes Holschuh wrote:

On 24/09/2021 17:04, e9hack wrote:
In the past (a few days ago), it was possible to disable or shut-down 
wifi by introduce the command 'wifi down'. This doesn't work 
currently. After some seconds, wifi is start again.


What version of openwrt, please?  And if you could be more specific than 
"a few days ago", it might help...


Never mind, rest of thread has enough data. Thanks :-)

--
Henrique de Moraes Holschuh
Analista de Projetos
Centro de Estudos e Pesquisas em Tecnologias de Redes e Operações 
(Ceptro.br)

+55 11 5509-3537 R.:4023
INOC 22548*625
www.nic.br

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-26 Thread Hannu Nyman

e9hack kirjoitti 26.9.2021 klo 15.48:

Am 26.09.2021 um 12:54 schrieb Hannu Nyman:

e9hack kirjoitti 26.9.2021 klo 10.02:

Am 24.09.2021 um 22:04 schrieb e9hack:
In the past (a few days ago), it was possible to disable or shut-down 
wifi by introduce the command 'wifi down'. This doesn't work currently. 
After some seconds, wifi is start again.


It may be related to a page fault of netifd. Netifd is restart afterwards:

[  236.658379] do_page_fault(): sending SIGSEGV to netifd for invalid 
write access to 77cdd048

[  236.666942] epc = 77d7ded5 in libubox.so.20210819[77d78000+18000]
[  236.673212] ra  = 77d7dec9 in libubox.so.20210819[77d78000+18000]

This occurs after 'wifi down'.



Wifi coming up again happens also with R7800 (ipq806x, ath10k), master 
build from two days ago.



System log shows wifi goes down, then the whole network stack gets 
restarted (?), and finally wifi comes back up.



  OpenWrt SNAPSHOT, r17581-2c9a07ed28
  -
root@router1:~# wifi down; logread -f
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: Remove interface 'wlan0'
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: interface state 
ENABLED->DISABLED


Do you see a page fault from netifd in the log? If it does crash, it is 
restarted by procd. This does restart the network stack.



I do not see any crash in the logs.

But I do see netifd PID first disappearing from the proccess list and then 
re-appearing with a changed PID, so apparently netifd closes/crashes/whatever 
and restarts:


root@router1:~# pgrep netifd
8773
root@router1:~# wifi down
root@router1:~# pgrep netifd
root@router1:~# pgrep netifd
root@router1:~# pgrep netifd
18139



___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-26 Thread e9hack

Am 26.09.2021 um 12:54 schrieb Hannu Nyman:

e9hack kirjoitti 26.9.2021 klo 10.02:

Am 24.09.2021 um 22:04 schrieb e9hack:

In the past (a few days ago), it was possible to disable or shut-down wifi by 
introduce the command 'wifi down'. This doesn't work currently. After some 
seconds, wifi is start again.


It may be related to a page fault of netifd. Netifd is restart afterwards:

[  236.658379] do_page_fault(): sending SIGSEGV to netifd for invalid write 
access to 77cdd048
[  236.666942] epc = 77d7ded5 in libubox.so.20210819[77d78000+18000]
[  236.673212] ra  = 77d7dec9 in libubox.so.20210819[77d78000+18000]

This occurs after 'wifi down'.



Wifi coming up again happens also with R7800 (ipq806x, ath10k), master build 
from two days ago.


System log shows wifi goes down, then the whole network stack gets restarted 
(?), and finally wifi comes back up.


  OpenWrt SNAPSHOT, r17581-2c9a07ed28
  -
root@router1:~# wifi down; logread -f
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: Remove interface 'wlan0'
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: interface state 
ENABLED->DISABLED


Do you see a page fault from netifd in the log? If it does crash, it is 
restarted by procd. This does restart the network stack. If I start netifd with 
strace, I got this lines immediately before the page fault:

unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = -1 ENOENT (No such file or 
directory)
open("/tmp/resolv.conf.d/resolv.conf.auto.tmp", 
O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 14
ioctl(14, TIOCGWINSZ, 0x7f717a74)   = -1 ENOTTY (Not a tty)
_llseek(14, 0, [0], SEEK_SET)   = 0
readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
close(14)   = 0
open("/tmp/resolv.conf.d/resolv.conf.auto", O_RDONLY|O_LARGEFILE) = 14
readv(14, [{iov_base="", iov_len=1023}, {iov_base="", iov_len=1024}], 2) = 0
close(14)   = 0
unlink("/tmp/resolv.conf.d/resolv.conf.auto.tmp") = 0
munmap(0x77d61000, 8192)= 0
--- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x77d61038} ---
+++ killed by SIGSEGV +++

The part 0x77d61... does change on several starts. The offset 0x.038 is the 
same.

Regards,
Hartmut

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-26 Thread Hannu Nyman

e9hack kirjoitti 26.9.2021 klo 10.02:

Am 24.09.2021 um 22:04 schrieb e9hack:
In the past (a few days ago), it was possible to disable or shut-down wifi 
by introduce the command 'wifi down'. This doesn't work currently. After 
some seconds, wifi is start again.


It may be related to a page fault of netifd. Netifd is restart afterwards:

[  236.658379] do_page_fault(): sending SIGSEGV to netifd for invalid write 
access to 77cdd048

[  236.666942] epc = 77d7ded5 in libubox.so.20210819[77d78000+18000]
[  236.673212] ra  = 77d7dec9 in libubox.so.20210819[77d78000+18000]

This occurs after 'wifi down'.



Wifi coming up again happens also with R7800 (ipq806x, ath10k), master build 
from two days ago.



System log shows wifi goes down, then the whole network stack gets restarted 
(?), and finally wifi comes back up.



 OpenWrt SNAPSHOT, r17581-2c9a07ed28
 -
root@router1:~# wifi down; logread -f
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: Remove interface 'wlan0'
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: interface state 
ENABLED->DISABLED
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: AP-STA-DISCONNECTED 
ac:57:75:56:c1:e0
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: AP-STA-DISCONNECTED 
e0:c3:77:ae:0a:30

Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: AP-DISABLED
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan0: CTRL-EVENT-TERMINATING
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: nl80211: deinit ifname=wlan0 
disabled_11b_rates=0
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.105006] device wlan0 left 
promiscuous mode
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.105082] br-lan: port 
2(wlan0) entered disabled state
Sun Sep 26 13:42:39 2021 daemon.notice netifd: Network device 'wlan0' link is 
down
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.128708] ath10k_pci 
:01:00.0: mac flush null vif, drop 0 queues 0x
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.130466] ath10k_pci 
:01:00.0: peer-unmap-event: unknown peer id 0
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.134851] ath10k_pci 
:01:00.0: peer-unmap-event: unknown peer id 0
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.141696] ath10k_pci 
:01:00.0: peer-unmap-event: unknown peer id 0

Sun Sep 26 13:42:39 2021 daemon.notice hostapd: Remove interface 'wlan1'
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan1: interface state 
ENABLED->DISABLED
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan1: AP-STA-DISCONNECTED 
30:cd:a7:b3:33:5d

Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan1: AP-DISABLED
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: wlan1: CTRL-EVENT-TERMINATING
Sun Sep 26 13:42:39 2021 daemon.notice hostapd: nl80211: deinit ifname=wlan1 
disabled_11b_rates=0
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.328057] device wlan1 left 
promiscuous mode
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.328145] br-lan: port 
3(wlan1) entered disabled state
Sun Sep 26 13:42:39 2021 kern.info kernel: [96769.378176] ath10k_pci 
0001:01:00.0: mac flush null vif, drop 0 queues 0x
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.379365] ath10k_pci 
0001:01:00.0: peer-unmap-event: unknown peer id 0
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.384347] ath10k_pci 
0001:01:00.0: peer-unmap-event: unknown peer id 0
Sun Sep 26 13:42:39 2021 kern.warn kernel: [96769.391288] ath10k_pci 
0001:01:00.0: peer-unmap-event: unknown peer id 0
Sun Sep 26 13:42:39 2021 daemon.notice netifd: Network device 'wlan1' link is 
down

Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: bonding
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: 8021ad
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: 8021q
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: macvlan
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: veth
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: bridge
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: Network device
Sun Sep 26 13:42:44 2021 user.notice : Added device handler type: tunnel
Sun Sep 26 13:42:45 2021 kern.info kernel: [96775.307649] br-lan: port 
1(eth1.1) entered disabled state
Sun Sep 26 13:42:45 2021 user.notice odhcpd: *** ODHCPD triggers DNSMASQ 
reload ***
Sun Sep 26 13:42:45 2021 kern.info kernel: [96775.319347] device eth1.1 left 
promiscuous mode
Sun Sep 26 13:42:45 2021 kern.info kernel: [96775.319372] device eth1 left 
promiscuous mode
Sun Sep 26 13:42:45 2021 kern.info kernel: [96775.322799] br-lan: port 
1(eth1.1) entered disabled state

Sun Sep 26 13:42:45 2021 daemon.info dnsmasq[5668]: read /etc/hosts - 4 
addresses
Sun Sep 26 13:42:45 2021 daemon.info dnsmasq[5668]: read 
/tmp/hosts/dhcp.cfg01411c - 3 addresses
Sun Sep 26 13:42:45 2021 daemon.info dnsmasq[5668]: read /tmp/hosts/odhcpd - 
0 addresses
Sun Sep 26 13:42:45 2021 daemon.info dnsmasq-dhcp[5668]: 

Re: Wifi bug

2021-09-26 Thread Henrique de Moraes Holschuh

On 24/09/2021 17:04, e9hack wrote:
In the past (a few days ago), it was possible to disable or shut-down 
wifi by introduce the command 'wifi down'. This doesn't work currently. 
After some seconds, wifi is start again.


What version of openwrt, please?  And if you could be more specific than 
"a few days ago", it might help...


Thanks for the report!

--
Henrique de Moraes Holschuh

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Re: Wifi bug

2021-09-26 Thread e9hack

Am 24.09.2021 um 22:04 schrieb e9hack:

In the past (a few days ago), it was possible to disable or shut-down wifi by 
introduce the command 'wifi down'. This doesn't work currently. After some 
seconds, wifi is start again.


It may be related to a page fault of netifd. Netifd is restart afterwards:

[  236.658379] do_page_fault(): sending SIGSEGV to netifd for invalid write 
access to 77cdd048
[  236.666942] epc = 77d7ded5 in libubox.so.20210819[77d78000+18000]
[  236.673212] ra  = 77d7dec9 in libubox.so.20210819[77d78000+18000]

This occurs after 'wifi down'.

Regards,
Hartmut


___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel


Wifi bug

2021-09-24 Thread e9hack

In the past (a few days ago), it was possible to disable or shut down wifi by 
issuing the command 'wifi down'. This doesn't work currently. After some 
seconds, wifi is started again.

Regards,
Hartmut

___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/mailman/listinfo/openwrt-devel