[PATCH v2 2/2] iio: trigger: Fix strange (ladder-type) indentation
In some cases indentation looks a bit weird with starting from = sign and being in a ladder-type style. Unify it across the module. While at it, add blank line after definition block where it needed, Signed-off-by: Andy Shevchenko --- v2: fixed typo in the commit message (tupe->type) drivers/iio/industrialio-trigger.c | 25 - 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 78e30f0f915c..ec72ff04b38d 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -211,6 +211,7 @@ EXPORT_SYMBOL(iio_trigger_notify_done); static int iio_trigger_get_irq(struct iio_trigger *trig) { int ret; + mutex_lock(&trig->pool_lock); ret = bitmap_find_free_region(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER, @@ -239,9 +240,9 @@ static void iio_trigger_put_irq(struct iio_trigger *trig, int irq) int iio_trigger_attach_poll_func(struct iio_trigger *trig, struct iio_poll_func *pf) { + bool notinuse = + bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER); int ret = 0; - bool notinuse - = bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER); /* Prevent the module from being removed whilst attached to a trigger */ __module_get(pf->indio_dev->driver_module); @@ -290,11 +291,10 @@ int iio_trigger_attach_poll_func(struct iio_trigger *trig, int iio_trigger_detach_poll_func(struct iio_trigger *trig, struct iio_poll_func *pf) { + bool no_other_users = + bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 1; int ret = 0; - bool no_other_users - = (bitmap_weight(trig->pool, -CONFIG_IIO_CONSUMERS_PER_TRIGGER) - == 1); + if (trig->ops && trig->ops->set_trigger_state && no_other_users) { ret = trig->ops->set_trigger_state(trig, false); if (ret) @@ -312,6 +312,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig, irqreturn_t iio_pollfunc_store_time(int irq, void *p) { struct iio_poll_func *pf = p; + pf->timestamp = iio_get_time_ns(pf->indio_dev); 
return IRQ_WAKE_THREAD; } @@ -498,18 +499,16 @@ static const struct device_type iio_trig_type = { static void iio_trig_subirqmask(struct irq_data *d) { struct irq_chip *chip = irq_data_get_irq_chip(d); - struct iio_trigger *trig - = container_of(chip, - struct iio_trigger, subirq_chip); + struct iio_trigger *trig = container_of(chip, struct iio_trigger, subirq_chip); + trig->subirqs[d->irq - trig->subirq_base].enabled = false; } static void iio_trig_subirqunmask(struct irq_data *d) { struct irq_chip *chip = irq_data_get_irq_chip(d); - struct iio_trigger *trig - = container_of(chip, - struct iio_trigger, subirq_chip); + struct iio_trigger *trig = container_of(chip, struct iio_trigger, subirq_chip); + trig->subirqs[d->irq - trig->subirq_base].enabled = true; } @@ -695,7 +694,7 @@ EXPORT_SYMBOL(iio_trigger_using_own); * device, -EINVAL otherwise. */ int iio_trigger_validate_own_device(struct iio_trigger *trig, - struct iio_dev *indio_dev) + struct iio_dev *indio_dev) { if (indio_dev->dev.parent != trig->dev.parent) return -EINVAL; -- 2.30.2
[PATCH v1 2/2] iio: trigger: Fix strange (ladder-tupe) indentation
In some cases indentation looks a bit weird with starting from = sign and being in a ladder-type style. Unify it across the module. While at it, add blank line after definition block where it needed, Signed-off-by: Andy Shevchenko --- drivers/iio/industrialio-trigger.c | 25 - 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 77fca24147b2..f998900a34f5 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -211,6 +211,7 @@ EXPORT_SYMBOL(iio_trigger_notify_done); static int iio_trigger_get_irq(struct iio_trigger *trig) { int ret; + mutex_lock(&trig->pool_lock); ret = bitmap_find_free_region(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER, @@ -239,9 +240,9 @@ static void iio_trigger_put_irq(struct iio_trigger *trig, int irq) int iio_trigger_attach_poll_func(struct iio_trigger *trig, struct iio_poll_func *pf) { + bool notinuse = + bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER); int ret = 0; - bool notinuse - = bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER); /* Prevent the module from being removed whilst attached to a trigger */ __module_get(pf->indio_dev->driver_module); @@ -290,11 +291,10 @@ int iio_trigger_attach_poll_func(struct iio_trigger *trig, int iio_trigger_detach_poll_func(struct iio_trigger *trig, struct iio_poll_func *pf) { + bool no_other_users = + bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 1; int ret = 0; - bool no_other_users - = (bitmap_weight(trig->pool, -CONFIG_IIO_CONSUMERS_PER_TRIGGER) - == 1); + if (trig->ops && trig->ops->set_trigger_state && no_other_users) { ret = trig->ops->set_trigger_state(trig, false); if (ret) @@ -312,6 +312,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig, irqreturn_t iio_pollfunc_store_time(int irq, void *p) { struct iio_poll_func *pf = p; + pf->timestamp = iio_get_time_ns(pf->indio_dev); return IRQ_WAKE_THREAD; } @@ -498,18 +499,16 @@ 
static const struct device_type iio_trig_type = { static void iio_trig_subirqmask(struct irq_data *d) { struct irq_chip *chip = irq_data_get_irq_chip(d); - struct iio_trigger *trig - = container_of(chip, - struct iio_trigger, subirq_chip); + struct iio_trigger *trig = container_of(chip, struct iio_trigger, subirq_chip); + trig->subirqs[d->irq - trig->subirq_base].enabled = false; } static void iio_trig_subirqunmask(struct irq_data *d) { struct irq_chip *chip = irq_data_get_irq_chip(d); - struct iio_trigger *trig - = container_of(chip, - struct iio_trigger, subirq_chip); + struct iio_trigger *trig = container_of(chip, struct iio_trigger, subirq_chip); + trig->subirqs[d->irq - trig->subirq_base].enabled = true; } @@ -695,7 +694,7 @@ EXPORT_SYMBOL(iio_trigger_using_own); * device, -EINVAL otherwise. */ int iio_trigger_validate_own_device(struct iio_trigger *trig, - struct iio_dev *indio_dev) + struct iio_dev *indio_dev) { if (indio_dev->dev.parent != trig->dev.parent) return -EINVAL; -- 2.30.2
BUG: broken overlay causes very strange kernel crash
Hi folks, while playing around with overlays, I've encountered a funny crash that even seems to affect the filesystem. No idea what really happens, as oftree code detected the broken phandle. What I did: * I've written a driver that loads a builtin oftree overlay and tries to apply it. * as it's running on x86 (acpi-based), I'm also creating an of_root node and add some properties to it. (yes, calling of_node_init()) * using of_overlay_fdt_apply(), which seemed to work, but still trying to find out how to make it add new top-level nodes ... * if the call fails, the driver does nothing (except printing the err) * when adding a fragment with target <0> the crash happens The crash happens *much* later than loading the overlay, NULL pointer deref in ext2_error(). Since I can't see any relation between oftree and ext2, this smells that oftree code is overwriting some unrelated memory. Maybe something's creating broken pointers and then writing there ? Obviously my driver code is shit, but those strange things happening smell like something weird is going on deep inside the oftree code, that maybe *could* provide an attack surface. Does anyone have an idea what's going on here ? thx --mtx [0.629870] OF: overlay: find target, node: /fragment@0, phandle 0x0 not found [0.631603] OF: overlay: init_overlay_changeset() failed, ret = -22 [0.633131] ofboard: ret=-22 ovcs_id=0 [0.634039] ofboard: dumping all nodes ... [0.634932] ofboard: ==> of node: [0.635579] ofboard: --> property: model [0.636333] ofboard: --> property: compatible [0.637202] ofboard: ret=-22 ovcs_id=0 [0.637919] ofboard: ofdrv done [0.638529] IPI shorthand broadcast: enabled [0.640553] VFS: Mounted root (ext2 filesystem) readonly on device 254:0. 
[0.642639] Freeing unused kernel image (initmem) memory: 700K [0.649080] Write protecting the kernel read-only data: 10240k [0.651415] Freeing unused kernel image (text/rodata gap) memory: 2044K [0.653478] Freeing unused kernel image (rodata/data gap) memory: 1124K [0.655178] Run /sbin/init as init process [0.665905] BUG: kernel NULL pointer dereference, address: 003a [0.667634] #PF: supervisor write access in kernel mode [0.668919] #PF: error_code(0x0002) - not-present page [0.669011] PGD 0 P4D 0 [0.669011] Oops: 0002 [#1] PREEMPT SMP PTI [0.669011] CPU: 0 PID: 25 Comm: rcS Not tainted 5.11.0-rc7-00105-g4fb1c4f664da-dirty #247 [0.669011] Hardware name: PC engines Standard PC (i440FX + PIIX, 1996)/APU3, BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 044 [0.669011] RIP: 0010:ext2_error+0x6d/0x90 [0.669011] Code: 30 31 c0 f6 47 50 01 0f 85 04 32 15 00 4d 8d bc 24 80 01 00 00 4c 89 ff e8 f0 a4 16 00 4c 89 ff 66 41 83 8c 24 9f [0.669011] RSP: 0018:c90d7aa8 EFLAGS: 00010206 [0.669011] RAX: RBX: 888000256000 RCX: 0077 [0.669011] RDX: 0001 RSI: 81895e52 RDI: 88800025e380 [0.669011] RBP: c90d7b38 R08: 88800048da78 R09: 8880019f8ff4 [0.669011] R10: R11: 8f9a8d98 R12: 88800025e200 [0.669011] R13: R14: 81895e52 R15: 88800025e380 [0.669011] FS: 7f500a373740() GS:888007a0() knlGS: [0.669011] CS: 0010 DS: ES: CR0: 80050033 [0.669011] CR2: 003a CR3: 009cc000 CR4: 06b0 [0.669011] Call Trace: [0.669011] ? kmem_cache_alloc+0x1a/0x150 [0.669011] ext2_get_inode+0x5e/0x130 [0.669011] ? iget_locked+0x1e3/0x1f0 [0.669011] ext2_iget+0x81/0x420 [0.669011] ext2_lookup+0x79/0xb0 [0.669011] __lookup_slow+0x79/0x130 [0.669011] walk_component+0x139/0x1b0 [0.669011] ? path_init+0x2bd/0x3e0 [0.669011] path_lookupat+0x6d/0x1b0 [0.669011] filename_lookup+0xa5/0x170 [0.669011] ? strncpy_from_user+0x53/0x140 [0.669011] user_path_at_empty+0x35/0x40 [0.669011] vfs_statx+0x6e/0x110 [0.669011] ? handle_mm_fault+0x11ee/0x1280 [0.669011] __do_sys_newstat+0x3e/0x70 [0.669011] ? 
irqentry_exit+0x3c/0x60 [0.669011] ? exc_page_fault+0x22c/0x380 [0.669011] ? __do_sys_rt_sigreturn+0xc5/0xe0 [0.669011] __x64_sys_newstat+0x11/0x20 [0.669011] do_syscall_64+0x32/0x50 [0.669011] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [0.669011] RIP: 0033:0x7f500a462ee6 [0.669011] Code: 00 00 75 05 48 83 c4 18 c3 e8 e6 ef 01 00 66 0f 1f 44 00 00 41 89 f8 48 89 f7 48 89 d6 41 83 f8 01 77 29 b8 04 02 [0.669011] RSP: 002b:7ffd1fb01848 EFLAGS: 0246 ORIG_RAX: 0004 [0.669011] RAX: ffda RBX: 7ffd1fb019d0 RCX: 7f500a462ee6 [0.669011] RDX: 7ffd1fb01890 RSI: 7ffd1fb01890 RDI: 561c13db2498 [0.669011] RBP: 561c13db1778 R08: 0001 R09: ff736cff6f64
Re: Strange problem with SCTP+IPv6
> On 26. Jun 2020, at 18:13, David Laight wrote: > > From: Xin Long >> Sent: 23 June 2020 11:14 It looks like a bug to me. Testing with this test app here, I can see the INIT_ACK being sent with a bunch of ipv4 addresses in it and that's unexpected for a v6only socket. As is, it's the server saying "I'm available at these other addresses too, but not." >>> I agree. >> Then we need a fix in sctp_bind_addrs_to_raw(): >> >> @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const >> struct sctp_bind_addr *bp, >>addrparms = retval; >> >>list_for_each_entry(addr, &bp->address_list, list) { >> + if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && >> + (AF_INET == addr->a.sa.sa_family)) >> + continue; >>af = sctp_get_af_specific(addr->a.v4.sin_family); >>len = af->to_addr_param(&addr->a, &rawaddr); >>memcpy(addrparms.v, &rawaddr, len); > > Thought. > > Does it make any sense to offer addresses in the INIT_ACK that don't > have routes to those proposed in the received INIT? > > 'routes' probably isn't exactly the right word. > You probably only want the local address that will be used > as the source address for the probes. > Or, at least, sources addresses that could be used for the probes. > > So if the INIT only contains IPv6 addresses should the INIT_ACK > ever contain IPv4 ones. The client (if it is not using an IPv6 socket having IPv6 only enabled) could add an IPv4 address during the lifetime of the association by using the address reconfiguration extension. What could be done is to not send IPv4 addresses if the INIT contains a Supported Address Types parameter indicating IPv6, but not IPv4 support. As a client you might want to send this parameter, when the IPv6 socket has enabled the IPV6_ONLY socket option. Also if the client uses an IPv4 socket, it can indicate in the Supported Address Parameter that it only supports IPv4, and the server does not need to list IPv6 addresses. Best regards Michael > > David. 
> > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 > 1PT, UK > Registration No: 1397386 (Wales)
RE: Strange problem with SCTP+IPv6
From: Xin Long > Sent: 23 June 2020 11:14 > > > It looks like a bug to me. Testing with this test app here, I can see > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > > that's unexpected for a v6only socket. As is, it's the server saying > > > "I'm available at these other addresses too, but not." > > I agree. > Then we need a fix in sctp_bind_addrs_to_raw(): > > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const > struct sctp_bind_addr *bp, > addrparms = retval; > > list_for_each_entry(addr, &bp->address_list, list) { > + if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && > + (AF_INET == addr->a.sa.sa_family)) > + continue; > af = sctp_get_af_specific(addr->a.v4.sin_family); > len = af->to_addr_param(&addr->a, &rawaddr); > memcpy(addrparms.v, &rawaddr, len); Thought. Does it make any sense to offer addresses in the INIT_ACK that don't have routes to those proposed in the received INIT? 'routes' probably isn't exactly the right word. You probably only want the local address that will be used as the source address for the probes. Or, at least, sources addresses that could be used for the probes. So if the INIT only contains IPv6 addresses should the INIT_ACK ever contain IPv4 ones. David. - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
Re: Strange problem with SCTP+IPv6
> On 24. Jun 2020, at 09:25, Xin Long wrote: > > On Wed, Jun 24, 2020 at 5:48 AM Michael Tuexen > wrote: >> >>> On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner >>> wrote: >>> >>> On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote: >>>>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner >>>>> wrote: >>>>> >>>>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: >>>>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: >>>>>>> From: Marcelo Ricardo Leitner >>>>>>>> Sent: 22 June 2020 19:33 >>>>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: >>>>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: >>>>>>>>>> >>>>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >>>>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard >>>>>>>>>>> wrote: >>>>>>>>>>>> >>>>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I >>>>>>>>>>>> create an >>>>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option >>>>>>>>>>>> on it, >>>>>>>>>>>> then I make a connection to it using ::1, the connection will drop >>>>>>>>>>>> after >>>>>>>>>>>> 2.5 seconds with an ECONNRESET error. >>>>>>>>>>>> >>>>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect >>>>>>>>>>>> to a >>>>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you >>>>>>>>>>>> don't >>>>>>>>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>>>>>>>>>>> I tried on an ARM system and x86_64. >>>>>>>>>>>> >>>>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, >>>>>>>>>>>> but I >>>>>>>>>>>> thought I would go ahead and report it. I am attaching a >>>>>>>>>>>> reproducer. 
>>>>>>>>>>>> Basically, compile the following code: >>>>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side >>>>>>>>>>> will >>>>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() >>>>>>>>>>> to >>>>>>>>>>> bind any specific addresses ). Then after the connection is created, >>>>>>>>>>> the client will send HB on the v4 paths to the server. The server >>>>>>>>>>> will abort the connection, as it can't support v4. >>>>>>>>>>> >>>>>>>>>>> So you can work around it by either: >>>>>>>>>>> >>>>>>>>>>> - set IPV6_V6ONLY on client side. >>>>>>>>>>> >>>>>>>>>>> or >>>>>>>>>>> >>>>>>>>>>> - bind to the specific v6 addresses on the client side. >>>>>>>>>>> >>>>>>>>>>> I don't see RFC said something about this. >>>>>>>>>>> So it may not be a good idea to change the current behaviour >>>>>>>>>>> to not establish the connection in this case, which may cause >>>>>>>>>>> regression. >>>>>>>>>> >>>>>>>>>> Ok, I understand this. It's a little strange, but I see why it works >>>>>>>>>> this way. >>>>>>>>> I don't. I would expect it to work as I described in my email. >>>>>>>>> Could someone explain me how and why it is behaving different from >>>>>>>>> my expectation? >>>>>>>&
Re: Strange problem with SCTP+IPv6
On Wed, Jun 24, 2020 at 5:48 AM Michael Tuexen wrote: > > > On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner > > wrote: > > > > On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote: > >>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner > >>> wrote: > >>> > >>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: > >>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: > >>>>> From: Marcelo Ricardo Leitner > >>>>>> Sent: 22 June 2020 19:33 > >>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > >>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > >>>>>>>> > >>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > >>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > >>>>>>>>> wrote: > >>>>>>>>>> > >>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I > >>>>>>>>>> create an > >>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option > >>>>>>>>>> on it, > >>>>>>>>>> then I make a connection to it using ::1, the connection will drop > >>>>>>>>>> after > >>>>>>>>>> 2.5 seconds with an ECONNRESET error. > >>>>>>>>>> > >>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect > >>>>>>>>>> to a > >>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you > >>>>>>>>>> don't > >>>>>>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > >>>>>>>>>> I tried on an ARM system and x86_64. > >>>>>>>>>> > >>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, > >>>>>>>>>> but I > >>>>>>>>>> thought I would go ahead and report it. I am attaching a > >>>>>>>>>> reproducer. > >>>>>>>>>> Basically, compile the following code: > >>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side > >>>>>>>>> will > >>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() > >>>>>>>>> to > >>>>>>>>> bind any specific addresses ). 
Then after the connection is created, > >>>>>>>>> the client will send HB on the v4 paths to the server. The server > >>>>>>>>> will abort the connection, as it can't support v4. > >>>>>>>>> > >>>>>>>>> So you can work around it by either: > >>>>>>>>> > >>>>>>>>> - set IPV6_V6ONLY on client side. > >>>>>>>>> > >>>>>>>>> or > >>>>>>>>> > >>>>>>>>> - bind to the specific v6 addresses on the client side. > >>>>>>>>> > >>>>>>>>> I don't see RFC said something about this. > >>>>>>>>> So it may not be a good idea to change the current behaviour > >>>>>>>>> to not establish the connection in this case, which may cause > >>>>>>>>> regression. > >>>>>>>> > >>>>>>>> Ok, I understand this. It's a little strange, but I see why it works > >>>>>>>> this way. > >>>>>>> I don't. I would expect it to work as I described in my email. > >>>>>>> Could someone explain me how and why it is behaving different from > >>>>>>> my expectation? > >>>>>> > >>>>>> It looks like a bug to me. Testing with this test app here, I can see > >>>>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and > >>>>>> that's unexpected for a v6only socket. As i
Re: Strange problem with SCTP+IPv6
On Wed, Jun 24, 2020 at 12:00 AM Corey Minyard wrote: > > On Tue, Jun 23, 2020 at 11:40:21PM +0800, Xin Long wrote: > > On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard wrote: > > > > > > On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote: > > > > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen > > > > wrote: > > > > > > > > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > > > > > > wrote: > > > > > > > > > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > > > >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > > >>> > > > > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > > > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > > > > > >>>> wrote: > > > > > >>>>> > > > > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I > > > > > >>>>> create an > > > > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket > > > > > >>>>> option on it, > > > > > >>>>> then I make a connection to it using ::1, the connection will > > > > > >>>>> drop after > > > > > >>>>> 2.5 seconds with an ECONNRESET error. > > > > > >>>>> > > > > > >>>>> It only happens on SCTP, it doesn't have the issue if you > > > > > >>>>> connect to a > > > > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you > > > > > >>>>> don't > > > > > >>>>> set IPV6_V6ONLY. I have verified current end of tree > > > > > >>>>> kernel.org. > > > > > >>>>> I tried on an ARM system and x86_64. > > > > > >>>>> > > > > > >>>>> I haven't dug into the kernel to see if I could find anything > > > > > >>>>> yet, but I > > > > > >>>>> thought I would go ahead and report it. I am attaching a > > > > > >>>>> reproducer. 
> > > > > >>>>> Basically, compile the following code: > > > > > >>>> The code only set IPV6_V6ONLY on server side, so the client side > > > > > >>>> will > > > > > >>>> still bind all the local ipv4 addresses (as you didn't call > > > > > >>>> bind() to > > > > > >>>> bind any specific addresses ). Then after the connection is > > > > > >>>> created, > > > > > >>>> the client will send HB on the v4 paths to the server. The server > > > > > >>>> will abort the connection, as it can't support v4. > > > > > >>>> > > > > > >>>> So you can work around it by either: > > > > > >>>> > > > > > >>>> - set IPV6_V6ONLY on client side. > > > > > >>>> > > > > > >>>> or > > > > > >>>> > > > > > >>>> - bind to the specific v6 addresses on the client side. > > > > > >>>> > > > > > >>>> I don't see RFC said something about this. > > > > > >>>> So it may not be a good idea to change the current behaviour > > > > > >>>> to not establish the connection in this case, which may cause > > > > > >>>> regression. > > > > > >>> > > > > > >>> Ok, I understand this. It's a little strange, but I see why it > > > > > >>> works > > > > > >>> this way. > > > > > >> I don't. I would expect it to work as I described in my email. > > > > > >> Could someone explain me how and why it is behaving different from > > > > > >> my expectation? > > > > > > > > > > > > It looks like a bug to me. Testing with this test app here, I can > > > > > > see > > > > > > the INIT_ACK being sent with a bunch of i
Re: Strange problem with SCTP+IPv6
> On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner > wrote: > > On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote: >>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner >>> wrote: >>> >>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: >>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: >>>>> From: Marcelo Ricardo Leitner >>>>>> Sent: 22 June 2020 19:33 >>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: >>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: >>>>>>>> >>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard >>>>>>>>> wrote: >>>>>>>>>> >>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I >>>>>>>>>> create an >>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on >>>>>>>>>> it, >>>>>>>>>> then I make a connection to it using ::1, the connection will drop >>>>>>>>>> after >>>>>>>>>> 2.5 seconds with an ECONNRESET error. >>>>>>>>>> >>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to >>>>>>>>>> a >>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't >>>>>>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>>>>>>>>> I tried on an ARM system and x86_64. >>>>>>>>>> >>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, >>>>>>>>>> but I >>>>>>>>>> thought I would go ahead and report it. I am attaching a reproducer. >>>>>>>>>> Basically, compile the following code: >>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will >>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to >>>>>>>>> bind any specific addresses ). Then after the connection is created, >>>>>>>>> the client will send HB on the v4 paths to the server. The server >>>>>>>>> will abort the connection, as it can't support v4. 
>>>>>>>>> >>>>>>>>> So you can work around it by either: >>>>>>>>> >>>>>>>>> - set IPV6_V6ONLY on client side. >>>>>>>>> >>>>>>>>> or >>>>>>>>> >>>>>>>>> - bind to the specific v6 addresses on the client side. >>>>>>>>> >>>>>>>>> I don't see RFC said something about this. >>>>>>>>> So it may not be a good idea to change the current behaviour >>>>>>>>> to not establish the connection in this case, which may cause >>>>>>>>> regression. >>>>>>>> >>>>>>>> Ok, I understand this. It's a little strange, but I see why it works >>>>>>>> this way. >>>>>>> I don't. I would expect it to work as I described in my email. >>>>>>> Could someone explain me how and why it is behaving different from >>>>>>> my expectation? >>>>>> >>>>>> It looks like a bug to me. Testing with this test app here, I can see >>>>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and >>>>>> that's unexpected for a v6only socket. As is, it's the server saying >>>>>> "I'm available at these other addresses too, but not." >>>>> >>>>> Does it even make sense to mix IPv4 and IPv6 addresses on the same >>>>> connection? >>>>> I don't remember ever seeing both types of address in a message, >>>>> but may not have looked. >>>> >>>> That's an interesting question. Do the RFCs say anything? I would >>>> assume it was ok
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote: > > On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner > > wrote: > > > > On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: > >> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: > >>> From: Marcelo Ricardo Leitner > >>>> Sent: 22 June 2020 19:33 > >>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > >>>>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > >>>>>> > >>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > >>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > >>>>>>> wrote: > >>>>>>>> > >>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I > >>>>>>>> create an > >>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on > >>>>>>>> it, > >>>>>>>> then I make a connection to it using ::1, the connection will drop > >>>>>>>> after > >>>>>>>> 2.5 seconds with an ECONNRESET error. > >>>>>>>> > >>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to > >>>>>>>> a > >>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't > >>>>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > >>>>>>>> I tried on an ARM system and x86_64. > >>>>>>>> > >>>>>>>> I haven't dug into the kernel to see if I could find anything yet, > >>>>>>>> but I > >>>>>>>> thought I would go ahead and report it. I am attaching a reproducer. > >>>>>>>> Basically, compile the following code: > >>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will > >>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to > >>>>>>> bind any specific addresses ). Then after the connection is created, > >>>>>>> the client will send HB on the v4 paths to the server. The server > >>>>>>> will abort the connection, as it can't support v4. > >>>>>>> > >>>>>>> So you can work around it by either: > >>>>>>> > >>>>>>> - set IPV6_V6ONLY on client side. 
> >>>>>>> > >>>>>>> or > >>>>>>> > >>>>>>> - bind to the specific v6 addresses on the client side. > >>>>>>> > >>>>>>> I don't see RFC said something about this. > >>>>>>> So it may not be a good idea to change the current behaviour > >>>>>>> to not establish the connection in this case, which may cause > >>>>>>> regression. > >>>>>> > >>>>>> Ok, I understand this. It's a little strange, but I see why it works > >>>>>> this way. > >>>>> I don't. I would expect it to work as I described in my email. > >>>>> Could someone explain me how and why it is behaving different from > >>>>> my expectation? > >>>> > >>>> It looks like a bug to me. Testing with this test app here, I can see > >>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and > >>>> that's unexpected for a v6only socket. As is, it's the server saying > >>>> "I'm available at these other addresses too, but not." > >>> > >>> Does it even make sense to mix IPv4 and IPv6 addresses on the same > >>> connection? > >>> I don't remember ever seeing both types of address in a message, > >>> but may not have looked. > >> > >> That's an interesting question. Do the RFCs say anything? I would > >> assume it was ok unless ipv6only was set. > >> > >>> > >>> I also wonder whether the connection should be dropped for an error > >>> response on a path that has never been validated. > >> > >> That actually bothered me a bit more. Should
Re: Strange problem with SCTP+IPv6
> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner > wrote: > > On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: >> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: >>> From: Marcelo Ricardo Leitner >>>> Sent: 22 June 2020 19:33 >>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: >>>>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: >>>>>> >>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: >>>>>>>> >>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I create >>>>>>>> an >>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on >>>>>>>> it, >>>>>>>> then I make a connection to it using ::1, the connection will drop >>>>>>>> after >>>>>>>> 2.5 seconds with an ECONNRESET error. >>>>>>>> >>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to a >>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't >>>>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>>>>>>> I tried on an ARM system and x86_64. >>>>>>>> >>>>>>>> I haven't dug into the kernel to see if I could find anything yet, but >>>>>>>> I >>>>>>>> thought I would go ahead and report it. I am attaching a reproducer. >>>>>>>> Basically, compile the following code: >>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will >>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to >>>>>>> bind any specific addresses ). Then after the connection is created, >>>>>>> the client will send HB on the v4 paths to the server. The server >>>>>>> will abort the connection, as it can't support v4. >>>>>>> >>>>>>> So you can work around it by either: >>>>>>> >>>>>>> - set IPV6_V6ONLY on client side. >>>>>>> >>>>>>> or >>>>>>> >>>>>>> - bind to the specific v6 addresses on the client side. >>>>>>> >>>>>>> I don't see RFC said something about this. 
>>>>>>> So it may not be a good idea to change the current behaviour >>>>>>> to not establish the connection in this case, which may cause >>>>>>> regression. >>>>>> >>>>>> Ok, I understand this. It's a little strange, but I see why it works >>>>>> this way. >>>>> I don't. I would expect it to work as I described in my email. >>>>> Could someone explain me how and why it is behaving different from >>>>> my expectation? >>>> >>>> It looks like a bug to me. Testing with this test app here, I can see >>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and >>>> that's unexpected for a v6only socket. As is, it's the server saying >>>> "I'm available at these other addresses too, but not." >>> >>> Does it even make sense to mix IPv4 and IPv6 addresses on the same >>> connection? >>> I don't remember ever seeing both types of address in a message, >>> but may not have looked. >> >> That's an interesting question. Do the RFCs say anything? I would >> assume it was ok unless ipv6only was set. >> >>> >>> I also wonder whether the connection should be dropped for an error >>> response on a path that has never been validated. >> >> That actually bothered me a bit more. Shouldn't it stay up if any path >> is up? That's kind of the whole point of multihoming. > > Michael explained it on the other email. What he described is what I > observed in my tests. > >> >>> >>> OTOH the whole 'multi-homing' part of SCTP sucks. >> >> I don't think so. >> >>> The IP addresses a server needs to bind to depend on where the >>> incoming connection will come from. >>> A local connection may be able to use a 192.168.x.x address >>> but a remote connection must not - as it may be defined locally >>> at the remote system. >>> But both connections can come into the public (routable) address. >>> We have to tell customers to explicitly configure the local IP >>> addresses - which means the application has to know what they are. 
>>> Fortunately these apps are pretty static - usually M3UA. >> >> Umm, no, If you have a private address, it better be behind a firewall, >> and the firewall should handle rewriting the packet to fix the addresses. >> >> It doesn't appear that Linux netfilter does this. There is a TODO in >> the code for this. But that's how it *should* work. > > Right, we don't support SCTP aware NAT [1]. > > 1.https://tools.ietf.org/html/draft-stewart-behave-sctpnat-04 The current version is: https://tools.ietf.org/html/draft-ietf-tsvwg-natsupp-16 Another possibility for NAT traversal is UDP encapsulation... Best regards Michael > > Marcelo > >> >> -corey >> >>> >>> David >>> >>> - >>> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 >>> 1PT, UK >>> Registration No: 1397386 (Wales) >>>
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote: > On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: > > From: Marcelo Ricardo Leitner > > > Sent: 22 June 2020 19:33 > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > > > On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > > > > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > > > > >> wrote: > > > > >>> > > > > >>> I've stumbled upon a strange problem with SCTP and IPv6. If I > > > > >>> create an > > > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option > > > > >>> on it, > > > > >>> then I make a connection to it using ::1, the connection will drop > > > > >>> after > > > > >>> 2.5 seconds with an ECONNRESET error. > > > > >>> > > > > >>> It only happens on SCTP, it doesn't have the issue if you connect > > > > >>> to a > > > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't > > > > >>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > > > >>> I tried on an ARM system and x86_64. > > > > >>> > > > > >>> I haven't dug into the kernel to see if I could find anything yet, > > > > >>> but I > > > > >>> thought I would go ahead and report it. I am attaching a > > > > >>> reproducer. > > > > >>> Basically, compile the following code: > > > > >> The code only set IPV6_V6ONLY on server side, so the client side will > > > > >> still bind all the local ipv4 addresses (as you didn't call bind() to > > > > >> bind any specific addresses ). Then after the connection is created, > > > > >> the client will send HB on the v4 paths to the server. The server > > > > >> will abort the connection, as it can't support v4. > > > > >> > > > > >> So you can work around it by either: > > > > >> > > > > >> - set IPV6_V6ONLY on client side. > > > > >> > > > > >> or > > > > >> > > > > >> - bind to the specific v6 addresses on the client side. 
> > > > >> > > > > >> I don't see RFC said something about this. > > > > >> So it may not be a good idea to change the current behaviour > > > > >> to not establish the connection in this case, which may cause > > > > >> regression. > > > > > > > > > > Ok, I understand this. It's a little strange, but I see why it works > > > > > this way. > > > > I don't. I would expect it to work as I described in my email. > > > > Could someone explain me how and why it is behaving different from > > > > my expectation? > > > > > > It looks like a bug to me. Testing with this test app here, I can see > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > > that's unexpected for a v6only socket. As is, it's the server saying > > > "I'm available at these other addresses too, but not." > > > > Does it even make sense to mix IPv4 and IPv6 addresses on the same > > connection? > > I don't remember ever seeing both types of address in a message, > > but may not have looked. > > That's an interesting question. Do the RFCs say anything? I would > assume it was ok unless ipv6only was set. > > > > > I also wonder whether the connection should be dropped for an error > > response on a path that has never been validated. > > That actually bothered me a bit more. Shouldn't it stay up if any path > is up? That's kind of the whole point of multihoming. Michael explained it on the other email. What he described is what I observed in my tests. > > > > > OTOH the whole 'multi-homing' part of SCTP sucks. > > I don't think so. > > > The IP addresses a server needs to bind to depend on where the > > incoming connection will come from. > > A local connection may be able to use a 192.168.x.x address > > but a remote connection must not - as it may be defined locally > > at the remote system. > > But both connections can come into the public (routable) address. 
> > We have to tell customers to explicitly configure the local IP > > addresses - which means the application has to know what they are. > > Fortunately these apps are pretty static - usually M3UA. > > Umm, no, If you have a private address, it better be behind a firewall, > and the firewall should handle rewriting the packet to fix the addresses. > > It doesn't appear that Linux netfilter does this. There is a TODO in > the code for this. But that's how it *should* work. Right, we don't support SCTP aware NAT [1]. 1.https://tools.ietf.org/html/draft-stewart-behave-sctpnat-04 Marcelo > > -corey > > > > > David > > > > - > > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 > > 1PT, UK > > Registration No: 1397386 (Wales) > >
Re: Strange problem with SCTP+IPv6
> On 23. Jun 2020, at 15:17, David Laight wrote: > > From: Marcelo Ricardo Leitner >> Sent: 22 June 2020 19:33 >> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: >>>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: >>>> >>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: >>>>>> >>>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an >>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, >>>>>> then I make a connection to it using ::1, the connection will drop after >>>>>> 2.5 seconds with an ECONNRESET error. >>>>>> >>>>>> It only happens on SCTP, it doesn't have the issue if you connect to a >>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't >>>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>>>>> I tried on an ARM system and x86_64. >>>>>> >>>>>> I haven't dug into the kernel to see if I could find anything yet, but I >>>>>> thought I would go ahead and report it. I am attaching a reproducer. >>>>>> Basically, compile the following code: >>>>> The code only set IPV6_V6ONLY on server side, so the client side will >>>>> still bind all the local ipv4 addresses (as you didn't call bind() to >>>>> bind any specific addresses ). Then after the connection is created, >>>>> the client will send HB on the v4 paths to the server. The server >>>>> will abort the connection, as it can't support v4. >>>>> >>>>> So you can work around it by either: >>>>> >>>>> - set IPV6_V6ONLY on client side. >>>>> >>>>> or >>>>> >>>>> - bind to the specific v6 addresses on the client side. >>>>> >>>>> I don't see RFC said something about this. >>>>> So it may not be a good idea to change the current behaviour >>>>> to not establish the connection in this case, which may cause regression. >>>> >>>> Ok, I understand this. It's a little strange, but I see why it works >>>> this way. >>> I don't. 
I would expect it to work as I described in my email. >>> Could someone explain me how and why it is behaving different from >>> my expectation? >> >> It looks like a bug to me. Testing with this test app here, I can see >> the INIT_ACK being sent with a bunch of ipv4 addresses in it and >> that's unexpected for a v6only socket. As is, it's the server saying >> "I'm available at these other addresses too, but not." > > Does it even make sense to mix IPv4 and IPv6 addresses on the same > connection? Sure, if you have an IPv6 socket, which has not enabled the IPV6ONLY socket option. > I don't remember ever seeing both types of address in a message, > but may not have looked. > > I also wonder whether the connection should be dropped for an error > response on a path that has never been validated. Assuming that it is not an ERROR chunk which comes back, but an ABORT, this should happen as long as the verification tag is OK. > > OTOH the whole 'multi-homing' part of SCTP sucks. > The IP addresses a server needs to bind to depend on where the > incoming connection will come from. Not sure what this means. The application can bind a wildcard address or a specific subset. However, when an INIT comes in, the INIT-ACK might contain only a subset of these due to scoping. > A local connection may be able to use a 192.168.x.x address > but a remote connection must not - as it may be defined locally > at the remote system. Yepp. Not sure what you can do about it. > But both connections can come into the public (routable) address. > We have to tell customers to explicitly configure the local IP > addresses - which means the application has to know what they are. > Fortunately these apps are pretty static - usually M3UA. Please note that in SIGTRAN scenarios you normally do not have NATs involved, as you usually have in setups used at home. 
Best regards Michael > > David > > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 > 1PT, UK > Registration No: 1397386 (Wales) >
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote: > From: Marcelo Ricardo Leitner > > Sent: 22 June 2020 19:33 > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > > On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > > > >>> > > > >>> I've stumbled upon a strange problem with SCTP and IPv6. If I create > > > >>> an > > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on > > > >>> it, > > > >>> then I make a connection to it using ::1, the connection will drop > > > >>> after > > > >>> 2.5 seconds with an ECONNRESET error. > > > >>> > > > >>> It only happens on SCTP, it doesn't have the issue if you connect to a > > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't > > > >>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > > >>> I tried on an ARM system and x86_64. > > > >>> > > > >>> I haven't dug into the kernel to see if I could find anything yet, > > > >>> but I > > > >>> thought I would go ahead and report it. I am attaching a reproducer. > > > >>> Basically, compile the following code: > > > >> The code only set IPV6_V6ONLY on server side, so the client side will > > > >> still bind all the local ipv4 addresses (as you didn't call bind() to > > > >> bind any specific addresses ). Then after the connection is created, > > > >> the client will send HB on the v4 paths to the server. The server > > > >> will abort the connection, as it can't support v4. > > > >> > > > >> So you can work around it by either: > > > >> > > > >> - set IPV6_V6ONLY on client side. > > > >> > > > >> or > > > >> > > > >> - bind to the specific v6 addresses on the client side. > > > >> > > > >> I don't see RFC said something about this. 
> > > >> So it may not be a good idea to change the current behaviour > > > >> to not establish the connection in this case, which may cause > > > >> regression. > > > > > > > > Ok, I understand this. It's a little strange, but I see why it works > > > > this way. > > > I don't. I would expect it to work as I described in my email. > > > Could someone explain me how and why it is behaving different from > > > my expectation? > > > > It looks like a bug to me. Testing with this test app here, I can see > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > that's unexpected for a v6only socket. As is, it's the server saying > > "I'm available at these other addresses too, but not." > > Does it even make sense to mix IPv4 and IPv6 addresses on the same > connection? > I don't remember ever seeing both types of address in a message, > but may not have looked. That's an interesting question. Do the RFCs say anything? I would assume it was ok unless ipv6only was set. > > I also wonder whether the connection should be dropped for an error > response on a path that has never been validated. That actually bothered me a bit more. Shouldn't it stay up if any path is up? That's kind of the whole point of multihoming. > > OTOH the whole 'multi-homing' part of SCTP sucks. I don't think so. > The IP addresses a server needs to bind to depend on where the > incoming connection will come from. > A local connection may be able to use a 192.168.x.x address > but a remote connection must not - as it may be defined locally > at the remote system. > But both connections can come into the public (routable) address. > We have to tell customers to explicitly configure the local IP > addresses - which means the application has to know what they are. > Fortunately these apps are pretty static - usually M3UA. Umm, no, If you have a private address, it better be behind a firewall, and the firewall should handle rewriting the packet to fix the addresses. 
It doesn't appear that Linux netfilter does this. There is a TODO in the code for this. But that's how it *should* work. -corey > > David > > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 > 1PT, UK > Registration No: 1397386 (Wales) >
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 11:40:21PM +0800, Xin Long wrote: > On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard wrote: > > > > On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote: > > > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen > > > wrote: > > > > > > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > > > > > wrote: > > > > > > > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > > >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > >>> > > > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > > > > >>>> wrote: > > > > >>>>> > > > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I > > > > >>>>> create an > > > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option > > > > >>>>> on it, > > > > >>>>> then I make a connection to it using ::1, the connection will > > > > >>>>> drop after > > > > >>>>> 2.5 seconds with an ECONNRESET error. > > > > >>>>> > > > > >>>>> It only happens on SCTP, it doesn't have the issue if you connect > > > > >>>>> to a > > > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you > > > > >>>>> don't > > > > >>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > > > >>>>> I tried on an ARM system and x86_64. > > > > >>>>> > > > > >>>>> I haven't dug into the kernel to see if I could find anything > > > > >>>>> yet, but I > > > > >>>>> thought I would go ahead and report it. I am attaching a > > > > >>>>> reproducer. > > > > >>>>> Basically, compile the following code: > > > > >>>> The code only set IPV6_V6ONLY on server side, so the client side > > > > >>>> will > > > > >>>> still bind all the local ipv4 addresses (as you didn't call bind() > > > > >>>> to > > > > >>>> bind any specific addresses ). Then after the connection is > > > > >>>> created, > > > > >>>> the client will send HB on the v4 paths to the server. 
The server > > > > >>>> will abort the connection, as it can't support v4. > > > > >>>> > > > > >>>> So you can work around it by either: > > > > >>>> > > > > >>>> - set IPV6_V6ONLY on client side. > > > > >>>> > > > > >>>> or > > > > >>>> > > > > >>>> - bind to the specific v6 addresses on the client side. > > > > >>>> > > > > >>>> I don't see RFC said something about this. > > > > >>>> So it may not be a good idea to change the current behaviour > > > > >>>> to not establish the connection in this case, which may cause > > > > >>>> regression. > > > > >>> > > > > >>> Ok, I understand this. It's a little strange, but I see why it > > > > >>> works > > > > >>> this way. > > > > >> I don't. I would expect it to work as I described in my email. > > > > >> Could someone explain me how and why it is behaving different from > > > > >> my expectation? > > > > > > > > > > It looks like a bug to me. Testing with this test app here, I can see > > > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > > > > that's unexpected for a v6only socket. As is, it's the server saying > > > > > "I'm available at these other addresses too, but not." > > > > I agree. > > > Then we need a fix in sctp_bind_addrs_to_raw(): > > > > > > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const > > > struct sctp_bind_
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard wrote: > > On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote: > > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen > > wrote: > > > > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > > > > wrote: > > > > > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > >>> > > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard > > > >>>> wrote: > > > >>>>> > > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I > > > >>>>> create an > > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option > > > >>>>> on it, > > > >>>>> then I make a connection to it using ::1, the connection will drop > > > >>>>> after > > > >>>>> 2.5 seconds with an ECONNRESET error. > > > >>>>> > > > >>>>> It only happens on SCTP, it doesn't have the issue if you connect > > > >>>>> to a > > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't > > > >>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > > >>>>> I tried on an ARM system and x86_64. > > > >>>>> > > > >>>>> I haven't dug into the kernel to see if I could find anything yet, > > > >>>>> but I > > > >>>>> thought I would go ahead and report it. I am attaching a > > > >>>>> reproducer. > > > >>>>> Basically, compile the following code: > > > >>>> The code only set IPV6_V6ONLY on server side, so the client side will > > > >>>> still bind all the local ipv4 addresses (as you didn't call bind() to > > > >>>> bind any specific addresses ). Then after the connection is created, > > > >>>> the client will send HB on the v4 paths to the server. The server > > > >>>> will abort the connection, as it can't support v4. > > > >>>> > > > >>>> So you can work around it by either: > > > >>>> > > > >>>> - set IPV6_V6ONLY on client side. 
> > > >>>> > > > >>>> or > > > >>>> > > > >>>> - bind to the specific v6 addresses on the client side. > > > >>>> > > > >>>> I don't see RFC said something about this. > > > >>>> So it may not be a good idea to change the current behaviour > > > >>>> to not establish the connection in this case, which may cause > > > >>>> regression. > > > >>> > > > >>> Ok, I understand this. It's a little strange, but I see why it works > > > >>> this way. > > > >> I don't. I would expect it to work as I described in my email. > > > >> Could someone explain me how and why it is behaving different from > > > >> my expectation? > > > > > > > > It looks like a bug to me. Testing with this test app here, I can see > > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > > > that's unexpected for a v6only socket. As is, it's the server saying > > > > "I'm available at these other addresses too, but not." > > > I agree. > > Then we need a fix in sctp_bind_addrs_to_raw(): > > > > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const > > struct sctp_bind_addr *bp, > > addrparms = retval; > > > > list_for_each_entry(addr, &bp->address_list, list) { > > + if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && > > + (AF_INET == addr->a.sa.sa_family)) > > + continue; > > This does not compile in the latest mainline. sk is not defined. > Also, if you could send a normal git patch, that would be easier to > manage. sorry, that was just the code to show the idea. For the compilable one, pls see: https://paste.cento
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote: > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen > wrote: > > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > > > wrote: > > > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > >>> > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > > >>>>> > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I create > > >>>>> an > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on > > >>>>> it, > > >>>>> then I make a connection to it using ::1, the connection will drop > > >>>>> after > > >>>>> 2.5 seconds with an ECONNRESET error. > > >>>>> > > >>>>> It only happens on SCTP, it doesn't have the issue if you connect to a > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't > > >>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > >>>>> I tried on an ARM system and x86_64. > > >>>>> > > >>>>> I haven't dug into the kernel to see if I could find anything yet, > > >>>>> but I > > >>>>> thought I would go ahead and report it. I am attaching a reproducer. > > >>>>> Basically, compile the following code: > > >>>> The code only set IPV6_V6ONLY on server side, so the client side will > > >>>> still bind all the local ipv4 addresses (as you didn't call bind() to > > >>>> bind any specific addresses ). Then after the connection is created, > > >>>> the client will send HB on the v4 paths to the server. The server > > >>>> will abort the connection, as it can't support v4. > > >>>> > > >>>> So you can work around it by either: > > >>>> > > >>>> - set IPV6_V6ONLY on client side. > > >>>> > > >>>> or > > >>>> > > >>>> - bind to the specific v6 addresses on the client side. > > >>>> > > >>>> I don't see RFC said something about this. 
> > >>>> So it may not be a good idea to change the current behaviour > > >>>> to not establish the connection in this case, which may cause > > >>>> regression. > > >>> > > >>> Ok, I understand this. It's a little strange, but I see why it works > > >>> this way. > > >> I don't. I would expect it to work as I described in my email. > > >> Could someone explain me how and why it is behaving different from > > >> my expectation? > > > > > > It looks like a bug to me. Testing with this test app here, I can see > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > > that's unexpected for a v6only socket. As is, it's the server saying > > > "I'm available at these other addresses too, but not." > > I agree. > Then we need a fix in sctp_bind_addrs_to_raw(): > > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const > struct sctp_bind_addr *bp, > addrparms = retval; > > list_for_each_entry(addr, &bp->address_list, list) { > + if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && > + (AF_INET == addr->a.sa.sa_family)) > + continue; This does not compile in the latest mainline. sk is not defined. Also, if you could send a normal git patch, that would be easier to manage. Thanks, -corey > af = sctp_get_af_specific(addr->a.v4.sin_family); > len = af->to_addr_param(&addr->a, &rawaddr); > memcpy(addrparms.v, &rawaddr, len); > > > > > Best regards > > Michael > > > > > > Thanks, > > > Marcelo > > > > > >> > > >> Best regards > > >> Michael > > >>> > > >>> Thanks, > > >>> > > >>> -corey > > >>> > > >>>> > > >>>>> > > >>>&g
RE: Strange problem with SCTP+IPv6
From: Marcelo Ricardo Leitner > Sent: 22 June 2020 19:33 > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > > On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > > >>> > > >>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, > > >>> then I make a connection to it using ::1, the connection will drop after > > >>> 2.5 seconds with an ECONNRESET error. > > >>> > > >>> It only happens on SCTP, it doesn't have the issue if you connect to a > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't > > >>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > >>> I tried on an ARM system and x86_64. > > >>> > > >>> I haven't dug into the kernel to see if I could find anything yet, but I > > >>> thought I would go ahead and report it. I am attaching a reproducer. > > >>> Basically, compile the following code: > > >> The code only set IPV6_V6ONLY on server side, so the client side will > > >> still bind all the local ipv4 addresses (as you didn't call bind() to > > >> bind any specific addresses ). Then after the connection is created, > > >> the client will send HB on the v4 paths to the server. The server > > >> will abort the connection, as it can't support v4. > > >> > > >> So you can work around it by either: > > >> > > >> - set IPV6_V6ONLY on client side. > > >> > > >> or > > >> > > >> - bind to the specific v6 addresses on the client side. > > >> > > >> I don't see RFC said something about this. > > >> So it may not be a good idea to change the current behaviour > > >> to not establish the connection in this case, which may cause regression. > > > > > > Ok, I understand this. It's a little strange, but I see why it works > > > this way. > > I don't. 
I would expect it to work as I described in my email. > > Could someone explain me how and why it is behaving different from > > my expectation? > > It looks like a bug to me. Testing with this test app here, I can see > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > that's unexpected for a v6only socket. As is, it's the server saying > "I'm available at these other addresses too, but not." Does it even make sense to mix IPv4 and IPv6 addresses on the same connection? I don't remember ever seeing both types of address in a message, but may not have looked. I also wonder whether the connection should be dropped for an error response on a path that has never been validated. OTOH the whole 'multi-homing' part of SCTP sucks. The IP addresses a server needs to bind to depend on where the incoming connection will come from. A local connection may be able to use a 192.168.x.x address but a remote connection must not - as it may be defined locally at the remote system. But both connections can come into the public (routable) address. We have to tell customers to explicitly configure the local IP addresses - which means the application has to know what they are. Fortunately these apps are pretty static - usually M3UA. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
Re: Strange problem with SCTP+IPv6
On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen wrote: > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > > wrote: > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > >>> > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > >>>>> > >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, > >>>>> then I make a connection to it using ::1, the connection will drop after > >>>>> 2.5 seconds with an ECONNRESET error. > >>>>> > >>>>> It only happens on SCTP, it doesn't have the issue if you connect to a > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't > >>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > >>>>> I tried on an ARM system and x86_64. > >>>>> > >>>>> I haven't dug into the kernel to see if I could find anything yet, but I > >>>>> thought I would go ahead and report it. I am attaching a reproducer. > >>>>> Basically, compile the following code: > >>>> The code only set IPV6_V6ONLY on server side, so the client side will > >>>> still bind all the local ipv4 addresses (as you didn't call bind() to > >>>> bind any specific addresses ). Then after the connection is created, > >>>> the client will send HB on the v4 paths to the server. The server > >>>> will abort the connection, as it can't support v4. > >>>> > >>>> So you can work around it by either: > >>>> > >>>> - set IPV6_V6ONLY on client side. > >>>> > >>>> or > >>>> > >>>> - bind to the specific v6 addresses on the client side. > >>>> > >>>> I don't see RFC said something about this. > >>>> So it may not be a good idea to change the current behaviour > >>>> to not establish the connection in this case, which may cause regression. > >>> > >>> Ok, I understand this. 
It's a little strange, but I see why it works > >>> this way. > >> I don't. I would expect it to work as I described in my email. > >> Could someone explain me how and why it is behaving different from > >> my expectation? > > > > It looks like a bug to me. Testing with this test app here, I can see > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > > that's unexpected for a v6only socket. As is, it's the server saying > > "I'm available at these other addresses too, but not." > I agree. Then we need a fix in sctp_bind_addrs_to_raw(): @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { + if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && + (AF_INET == addr->a.sa.sa_family)) + continue; af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); > > Best regards > Michael > > > > Thanks, > > Marcelo > > > >> > >> Best regards > >> Michael > >>> > >>> Thanks, > >>> > >>> -corey > >>> > >>>> > >>>>> > >>>>> gcc -g -o sctptest -Wall sctptest.c > >>>>> > >>>>> and run it in one window as a server: > >>>>> > >>>>> ./sctptest a > >>>>> > >>>>> (Pass in any option to be the server) and run the following in another > >>>>> window as the client: > >>>>> > >>>>> ./sctptest > >>>>> > >>>>> It disconnects after about 2.5 seconds. If it works, it should just sit > >>>>> there forever. > >>>>> > >>>>> -corey > >>>>> > >>>>> > >>>>> #include > >>>>> #include > >>>>>
Re: Strange problem with SCTP+IPv6
> On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner > wrote: > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: >>> On 22. Jun 2020, at 18:57, Corey Minyard wrote: >>> >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: >>>>> >>>>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, >>>>> then I make a connection to it using ::1, the connection will drop after >>>>> 2.5 seconds with an ECONNRESET error. >>>>> >>>>> It only happens on SCTP, it doesn't have the issue if you connect to a >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't >>>>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>>>> I tried on an ARM system and x86_64. >>>>> >>>>> I haven't dug into the kernel to see if I could find anything yet, but I >>>>> thought I would go ahead and report it. I am attaching a reproducer. >>>>> Basically, compile the following code: >>>> The code only set IPV6_V6ONLY on server side, so the client side will >>>> still bind all the local ipv4 addresses (as you didn't call bind() to >>>> bind any specific addresses ). Then after the connection is created, >>>> the client will send HB on the v4 paths to the server. The server >>>> will abort the connection, as it can't support v4. >>>> >>>> So you can work around it by either: >>>> >>>> - set IPV6_V6ONLY on client side. >>>> >>>> or >>>> >>>> - bind to the specific v6 addresses on the client side. >>>> >>>> I don't see RFC said something about this. >>>> So it may not be a good idea to change the current behaviour >>>> to not establish the connection in this case, which may cause regression. >>> >>> Ok, I understand this. It's a little strange, but I see why it works >>> this way. >> I don't. I would expect it to work as I described in my email. 
>> Could someone explain me how and why it is behaving different from >> my expectation? > > It looks like a bug to me. Testing with this test app here, I can see > the INIT_ACK being sent with a bunch of ipv4 addresses in it and > that's unexpected for a v6only socket. As is, it's the server saying > "I'm available at these other addresses too, but not." I agree. Best regards Michael > > Thanks, > Marcelo > >> >> Best regards >> Michael >>> >>> Thanks, >>> >>> -corey >>> >>>> >>>>> >>>>> gcc -g -o sctptest -Wall sctptest.c >>>>> >>>>> and run it in one window as a server: >>>>> >>>>> ./sctptest a >>>>> >>>>> (Pass in any option to be the server) and run the following in another >>>>> window as the client: >>>>> >>>>> ./sctptest >>>>> >>>>> It disconnects after about 2.5 seconds. If it works, it should just sit >>>>> there forever. >>>>> >>>>> -corey >>>>> >>>>> >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> #include >>>>> >>>>> static int >>>>> getaddr(const char *addr, const char *port, bool listen, >>>>> struct addrinfo **rai) >>>>> { >>>>> struct addrinfo *ai, hints; >>>>> >>>>> memset(&hints, 0, sizeof(hints)); >>>>> hints.ai_flags = AI_ADDRCONFIG; >>>>> if (listen) >>>>> hints.ai_flags |= AI_PASSIVE; >>>>> hints.ai_family = AF_UNSPEC; >>>>> hints.ai_socktype = SOCK_STREAM; >>>>> hints.ai_protocol = IPPROTO_SCTP; >>>>> if (getaddrinfo(addr, port, &hints, &ai)) { >>>>> perror("getaddrinfo"); >>>>> return -1; >>>>
Re: Strange problem with SCTP+IPv6
On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote: > > On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > >>> > >>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, > >>> then I make a connection to it using ::1, the connection will drop after > >>> 2.5 seconds with an ECONNRESET error. > >>> > >>> It only happens on SCTP, it doesn't have the issue if you connect to a > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't > >>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. > >>> I tried on an ARM system and x86_64. > >>> > >>> I haven't dug into the kernel to see if I could find anything yet, but I > >>> thought I would go ahead and report it. I am attaching a reproducer. > >>> Basically, compile the following code: > >> The code only set IPV6_V6ONLY on server side, so the client side will > >> still bind all the local ipv4 addresses (as you didn't call bind() to > >> bind any specific addresses ). Then after the connection is created, > >> the client will send HB on the v4 paths to the server. The server > >> will abort the connection, as it can't support v4. > >> > >> So you can work around it by either: > >> > >> - set IPV6_V6ONLY on client side. > >> > >> or > >> > >> - bind to the specific v6 addresses on the client side. > >> > >> I don't see RFC said something about this. > >> So it may not be a good idea to change the current behaviour > >> to not establish the connection in this case, which may cause regression. > > > > Ok, I understand this. It's a little strange, but I see why it works > > this way. > I don't. I would expect it to work as I described in my email. > Could someone explain me how and why it is behaving different from > my expectation? It looks like a bug to me. 
Testing with this test app here, I can see the INIT_ACK being sent with a bunch of ipv4 addresses in it and that's unexpected for a v6only socket. As is, it's the server saying "I'm available at these other addresses too, but not." Thanks, Marcelo > > Best regards > Michael > > > > Thanks, > > > > -corey > > > >> > >>> > >>> gcc -g -o sctptest -Wall sctptest.c > >>> > >>> and run it in one window as a server: > >>> > >>> ./sctptest a > >>> > >>> (Pass in any option to be the server) and run the following in another > >>> window as the client: > >>> > >>> ./sctptest > >>> > >>> It disconnects after about 2.5 seconds. If it works, it should just sit > >>> there forever. > >>> > >>> -corey > >>> > >>> > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> #include > >>> > >>> static int > >>> getaddr(const char *addr, const char *port, bool listen, > >>>struct addrinfo **rai) > >>> { > >>>struct addrinfo *ai, hints; > >>> > >>>memset(&hints, 0, sizeof(hints)); > >>>hints.ai_flags = AI_ADDRCONFIG; > >>>if (listen) > >>>hints.ai_flags |= AI_PASSIVE; > >>>hints.ai_family = AF_UNSPEC; > >>>hints.ai_socktype = SOCK_STREAM; > >>>hints.ai_protocol = IPPROTO_SCTP; > >>>if (getaddrinfo(addr, port, &hints, &ai)) { > >>>perror("getaddrinfo"); > >>>return -1; > >>>} > >>> > >>>*rai = ai; > >>>return 0; > >>> } > >>> > >>> static int > >>> waitread(int s) > >>> { > >>>char data[1]; > >>>ssize_t rv; > >>> > >>>rv = read(s, data, sizeof(data)); > >>>if (rv == -1) { > >>>perror("
Re: Strange problem with SCTP+IPv6
> On 22. Jun 2020, at 18:57, Corey Minyard wrote: > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: >>> >>> I've stumbled upon a strange problem with SCTP and IPv6. If I create an >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, >>> then I make a connection to it using ::1, the connection will drop after >>> 2.5 seconds with an ECONNRESET error. >>> >>> It only happens on SCTP, it doesn't have the issue if you connect to a >>> full IPv6 address instead of ::1, and it doesn't happen if you don't >>> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >>> I tried on an ARM system and x86_64. >>> >>> I haven't dug into the kernel to see if I could find anything yet, but I >>> thought I would go ahead and report it. I am attaching a reproducer. >>> Basically, compile the following code: >> The code only set IPV6_V6ONLY on server side, so the client side will >> still bind all the local ipv4 addresses (as you didn't call bind() to >> bind any specific addresses ). Then after the connection is created, >> the client will send HB on the v4 paths to the server. The server >> will abort the connection, as it can't support v4. >> >> So you can work around it by either: >> >> - set IPV6_V6ONLY on client side. >> >> or >> >> - bind to the specific v6 addresses on the client side. >> >> I don't see RFC said something about this. >> So it may not be a good idea to change the current behaviour >> to not establish the connection in this case, which may cause regression. > > Ok, I understand this. It's a little strange, but I see why it works > this way. I don't. I would expect it to work as I described in my email. Could someone explain me how and why it is behaving different from my expectation? 
Best regards Michael > > Thanks, > > -corey > >> >>> >>> gcc -g -o sctptest -Wall sctptest.c >>> >>> and run it in one window as a server: >>> >>> ./sctptest a >>> >>> (Pass in any option to be the server) and run the following in another >>> window as the client: >>> >>> ./sctptest >>> >>> It disconnects after about 2.5 seconds. If it works, it should just sit >>> there forever. >>> >>> -corey >>> >>> >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> #include >>> >>> static int >>> getaddr(const char *addr, const char *port, bool listen, >>>struct addrinfo **rai) >>> { >>>struct addrinfo *ai, hints; >>> >>>memset(&hints, 0, sizeof(hints)); >>>hints.ai_flags = AI_ADDRCONFIG; >>>if (listen) >>>hints.ai_flags |= AI_PASSIVE; >>>hints.ai_family = AF_UNSPEC; >>>hints.ai_socktype = SOCK_STREAM; >>>hints.ai_protocol = IPPROTO_SCTP; >>>if (getaddrinfo(addr, port, &hints, &ai)) { >>>perror("getaddrinfo"); >>>return -1; >>>} >>> >>>*rai = ai; >>>return 0; >>> } >>> >>> static int >>> waitread(int s) >>> { >>>char data[1]; >>>ssize_t rv; >>> >>>rv = read(s, data, sizeof(data)); >>>if (rv == -1) { >>>perror("read"); >>>return -1; >>>} >>>printf("Read %d bytes\n", (int) rv); >>>return 0; >>> } >>> >>> static int >>> do_server(void) >>> { >>>int err, ls, s, optval; >>>struct addrinfo *ai; >>> >>>printf("Server\n"); >>> >>>err = getaddr("::", "3023", true, &ai); >>>if (err) >>>return err; >>> >>>ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); >>>if (ls == -1) { >>>perror("socket"); >>>return -1; >>>} >>> >>>optval = 1; >>>if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, >>> (void *)&optval, sizeof(o
Re: Strange problem with SCTP+IPv6
On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote: > On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > > > > I've stumbled upon a strange problem with SCTP and IPv6. If I create an > > sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, > > then I make a connection to it using ::1, the connection will drop after > > 2.5 seconds with an ECONNRESET error. > > > > It only happens on SCTP, it doesn't have the issue if you connect to a > > full IPv6 address instead of ::1, and it doesn't happen if you don't > > set IPV6_V6ONLY. I have verified current end of tree kernel.org. > > I tried on an ARM system and x86_64. > > > > I haven't dug into the kernel to see if I could find anything yet, but I > > thought I would go ahead and report it. I am attaching a reproducer. > > Basically, compile the following code: > The code only set IPV6_V6ONLY on server side, so the client side will > still bind all the local ipv4 addresses (as you didn't call bind() to > bind any specific addresses ). Then after the connection is created, > the client will send HB on the v4 paths to the server. The server > will abort the connection, as it can't support v4. > > So you can work around it by either: > > - set IPV6_V6ONLY on client side. > > or > > - bind to the specific v6 addresses on the client side. > > I don't see RFC said something about this. > So it may not be a good idea to change the current behaviour > to not establish the connection in this case, which may cause regression. Ok, I understand this. It's a little strange, but I see why it works this way. Thanks, -corey > > > > > gcc -g -o sctptest -Wall sctptest.c > > > > and run it in one window as a server: > > > > ./sctptest a > > > > (Pass in any option to be the server) and run the following in another > > window as the client: > > > > ./sctptest > > > > It disconnects after about 2.5 seconds. If it works, it should just sit > > there forever. 
> > > > -corey > > > > > > #include > > #include > > #include > > #include > > #include > > #include > > #include > > #include > > #include > > #include > > #include > > > > static int > > getaddr(const char *addr, const char *port, bool listen, > > struct addrinfo **rai) > > { > > struct addrinfo *ai, hints; > > > > memset(&hints, 0, sizeof(hints)); > > hints.ai_flags = AI_ADDRCONFIG; > > if (listen) > > hints.ai_flags |= AI_PASSIVE; > > hints.ai_family = AF_UNSPEC; > > hints.ai_socktype = SOCK_STREAM; > > hints.ai_protocol = IPPROTO_SCTP; > > if (getaddrinfo(addr, port, &hints, &ai)) { > > perror("getaddrinfo"); > > return -1; > > } > > > > *rai = ai; > > return 0; > > } > > > > static int > > waitread(int s) > > { > > char data[1]; > > ssize_t rv; > > > > rv = read(s, data, sizeof(data)); > > if (rv == -1) { > > perror("read"); > > return -1; > > } > > printf("Read %d bytes\n", (int) rv); > > return 0; > > } > > > > static int > > do_server(void) > > { > > int err, ls, s, optval; > > struct addrinfo *ai; > > > > printf("Server\n"); > > > > err = getaddr("::", "3023", true, &ai); > > if (err) > > return err; > > > > ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); > > if (ls == -1) { > > perror("socket"); > > return -1; > > } > > > > optval = 1; > > if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, > >(void *)&optval, sizeof(optval)) == -1) { > > perror("setsockopt reuseaddr"); > > return -1; > > } > > > > /* Comment this out and it will work. */ > > if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, &optval, > >sizeof(optval)) == -1) { > > perror("setsockopt ipv6 only"); > > return -1; > > } > > > > err = bind(ls, ai->ai_addr, ai->ai_addrlen); > > if (err == -1) { > > perror("bind"); > > return -1; > > } > > > >
Re: Strange problem with SCTP+IPv6
> On 22. Jun 2020, at 14:01, Xin Long wrote: > > On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: >> >> I've stumbled upon a strange problem with SCTP and IPv6. If I create an >> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, >> then I make a connection to it using ::1, the connection will drop after >> 2.5 seconds with an ECONNRESET error. >> >> It only happens on SCTP, it doesn't have the issue if you connect to a >> full IPv6 address instead of ::1, and it doesn't happen if you don't >> set IPV6_V6ONLY. I have verified current end of tree kernel.org. >> I tried on an ARM system and x86_64. >> >> I haven't dug into the kernel to see if I could find anything yet, but I >> thought I would go ahead and report it. I am attaching a reproducer. >> Basically, compile the following code: > The code only set IPV6_V6ONLY on server side, so the client side will > still bind all the local ipv4 addresses (as you didn't call bind() to > bind any specific addresses ). Then after the connection is created, Let's focus on the loopback addresses ::1 and 127.0.0.1. So the server will only use ::1. The client will send an INIT from ::1 to ::1 and lists 127.0.0.1 and ::1. That is what I would expect. Is that happening? The server would respond with an INIT-ACK from ::1 to ::1 and would not list any IP addresses. Especially not 127.0.0.1, since it is IPv6 only. After the association has been established, the client can't send any IPv4 packet to the server, since the server did not announce any. The server can't send any IPv4 packets since it is IPv6 only. This is what I would expect and this scenario should just work. What am I missing? Best regards Michael > the client will send HB on the v4 paths to the server. The server > will abort the connection, as it can't support v4. > > So you can work around it by either: > > - set IPV6_V6ONLY on client side. > > or > > - bind to the specific v6 addresses on the client side. 
> > I don't see RFC said something about this. > So it may not be a good idea to change the current behaviour > to not establish the connection in this case, which may cause regression. > >> >> gcc -g -o sctptest -Wall sctptest.c >> >> and run it in one window as a server: >> >> ./sctptest a >> >> (Pass in any option to be the server) and run the following in another >> window as the client: >> >> ./sctptest >> >> It disconnects after about 2.5 seconds. If it works, it should just sit >> there forever. >> >> -corey >> >> >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> >> static int >> getaddr(const char *addr, const char *port, bool listen, >>struct addrinfo **rai) >> { >>struct addrinfo *ai, hints; >> >>memset(&hints, 0, sizeof(hints)); >>hints.ai_flags = AI_ADDRCONFIG; >>if (listen) >>hints.ai_flags |= AI_PASSIVE; >>hints.ai_family = AF_UNSPEC; >>hints.ai_socktype = SOCK_STREAM; >>hints.ai_protocol = IPPROTO_SCTP; >>if (getaddrinfo(addr, port, &hints, &ai)) { >>perror("getaddrinfo"); >>return -1; >>} >> >>*rai = ai; >>return 0; >> } >> >> static int >> waitread(int s) >> { >>char data[1]; >>ssize_t rv; >> >>rv = read(s, data, sizeof(data)); >>if (rv == -1) { >>perror("read"); >>return -1; >>} >>printf("Read %d bytes\n", (int) rv); >>return 0; >> } >> >> static int >> do_server(void) >> { >>int err, ls, s, optval; >>struct addrinfo *ai; >> >>printf("Server\n"); >> >>err = getaddr("::", "3023", true, &ai); >>if (err) >>return err; >> >>ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); >>if (ls == -1) { >>perror("socket"); >>return -1; >>} >> >>optval = 1; >>if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, >> (void *)&optval, sizeof(optval)) == -1) { >>perror("setsockopt reuseaddr"); >>return -1; >>} >> >>/* Comment this out and it will work. */ &g
Re: Strange problem with SCTP+IPv6
On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard wrote: > > I've stumbled upon a strange problem with SCTP and IPv6. If I create an > sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, > then I make a connection to it using ::1, the connection will drop after > 2.5 seconds with an ECONNRESET error. > > It only happens on SCTP, it doesn't have the issue if you connect to a > full IPv6 address instead of ::1, and it doesn't happen if you don't > set IPV6_V6ONLY. I have verified current end of tree kernel.org. > I tried on an ARM system and x86_64. > > I haven't dug into the kernel to see if I could find anything yet, but I > thought I would go ahead and report it. I am attaching a reproducer. > Basically, compile the following code: The code only set IPV6_V6ONLY on server side, so the client side will still bind all the local ipv4 addresses (as you didn't call bind() to bind any specific addresses ). Then after the connection is created, the client will send HB on the v4 paths to the server. The server will abort the connection, as it can't support v4. So you can work around it by either: - set IPV6_V6ONLY on client side. or - bind to the specific v6 addresses on the client side. I don't see RFC said something about this. So it may not be a good idea to change the current behaviour to not establish the connection in this case, which may cause regression. > > gcc -g -o sctptest -Wall sctptest.c > > and run it in one window as a server: > > ./sctptest a > > (Pass in any option to be the server) and run the following in another > window as the client: > > ./sctptest > > It disconnects after about 2.5 seconds. If it works, it should just sit > there forever. 
> > -corey > > > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > > static int > getaddr(const char *addr, const char *port, bool listen, > struct addrinfo **rai) > { > struct addrinfo *ai, hints; > > memset(&hints, 0, sizeof(hints)); > hints.ai_flags = AI_ADDRCONFIG; > if (listen) > hints.ai_flags |= AI_PASSIVE; > hints.ai_family = AF_UNSPEC; > hints.ai_socktype = SOCK_STREAM; > hints.ai_protocol = IPPROTO_SCTP; > if (getaddrinfo(addr, port, &hints, &ai)) { > perror("getaddrinfo"); > return -1; > } > > *rai = ai; > return 0; > } > > static int > waitread(int s) > { > char data[1]; > ssize_t rv; > > rv = read(s, data, sizeof(data)); > if (rv == -1) { > perror("read"); > return -1; > } > printf("Read %d bytes\n", (int) rv); > return 0; > } > > static int > do_server(void) > { > int err, ls, s, optval; > struct addrinfo *ai; > > printf("Server\n"); > > err = getaddr("::", "3023", true, &ai); > if (err) > return err; > > ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); > if (ls == -1) { > perror("socket"); > return -1; > } > > optval = 1; > if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, >(void *)&optval, sizeof(optval)) == -1) { > perror("setsockopt reuseaddr"); > return -1; > } > > /* Comment this out and it will work. 
*/ > if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, &optval, >sizeof(optval)) == -1) { > perror("setsockopt ipv6 only"); > return -1; > } > > err = bind(ls, ai->ai_addr, ai->ai_addrlen); > if (err == -1) { > perror("bind"); > return -1; > } > > err = listen(ls, 5); > if (err == -1) { > perror("listen"); > return -1; > } > > s = accept(ls, NULL, NULL); > if (s == -1) { > perror("accept"); > return -1; > } > > close(ls); > > err = waitread(s); > close(s); > return err; > } > > static int > do_client(void) > { > int err, s; > struct addrinfo *ai; > > printf("Client\n"); > > err = getaddr("::1", "3023", false, &ai); > if (err) > return err; > > s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); > if (s == -1) { > perror("socket"); > return -1; > } > > err = connect(s, ai->ai_addr, ai->ai_addrlen); > if (err == -1) { > perror("connect"); > return -1; > } > > err = waitread(s); > close(s); > return err; > } > > int > main(int argc, char *argv[]) > { > int err; > > if (argc > 1) > err = do_server(); > else > err = do_client(); > return !!err; > } >
Strange problem with SCTP+IPv6
I've stumbled upon a strange problem with SCTP and IPv6. If I create an sctp listening socket on :: and set the IPV6_V6ONLY socket option on it, then I make a connection to it using ::1, the connection will drop after 2.5 seconds with an ECONNRESET error. It only happens on SCTP, it doesn't have the issue if you connect to a full IPv6 address instead of ::1, and it doesn't happen if you don't set IPV6_V6ONLY. I have verified current end of tree kernel.org. I tried on an ARM system and x86_64. I haven't dug into the kernel to see if I could find anything yet, but I thought I would go ahead and report it. I am attaching a reproducer. Basically, compile the following code: gcc -g -o sctptest -Wall sctptest.c and run it in one window as a server: ./sctptest a (Pass in any option to be the server) and run the following in another window as the client: ./sctptest It disconnects after about 2.5 seconds. If it works, it should just sit there forever. -corey #include #include #include #include #include #include #include #include #include #include #include static int getaddr(const char *addr, const char *port, bool listen, struct addrinfo **rai) { struct addrinfo *ai, hints; memset(&hints, 0, sizeof(hints)); hints.ai_flags = AI_ADDRCONFIG; if (listen) hints.ai_flags |= AI_PASSIVE; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_SCTP; if (getaddrinfo(addr, port, &hints, &ai)) { perror("getaddrinfo"); return -1; } *rai = ai; return 0; } static int waitread(int s) { char data[1]; ssize_t rv; rv = read(s, data, sizeof(data)); if (rv == -1) { perror("read"); return -1; } printf("Read %d bytes\n", (int) rv); return 0; } static int do_server(void) { int err, ls, s, optval; struct addrinfo *ai; printf("Server\n"); err = getaddr("::", "3023", true, &ai); if (err) return err; ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (ls == -1) { perror("socket"); return -1; } optval = 1; if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, 
(void *)&optval, sizeof(optval)) == -1) { perror("setsockopt reuseaddr"); return -1; } /* Comment this out and it will work. */ if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, &optval, sizeof(optval)) == -1) { perror("setsockopt ipv6 only"); return -1; } err = bind(ls, ai->ai_addr, ai->ai_addrlen); if (err == -1) { perror("bind"); return -1; } err = listen(ls, 5); if (err == -1) { perror("listen"); return -1; } s = accept(ls, NULL, NULL); if (s == -1) { perror("accept"); return -1; } close(ls); err = waitread(s); close(s); return err; } static int do_client(void) { int err, s; struct addrinfo *ai; printf("Client\n"); err = getaddr("::1", "3023", false, &ai); if (err) return err; s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (s == -1) { perror("socket"); return -1; } err = connect(s, ai->ai_addr, ai->ai_addrlen); if (err == -1) { perror("connect"); return -1; } err = waitread(s); close(s); return err; } int main(int argc, char *argv[]) { int err; if (argc > 1) err = do_server(); else err = do_client(); return !!err; }
linux-next: strange set of patches in the y2038 tree
Hi Arnd, You seem to have remerged a whole series of patches in your tree: commits a55aa89aab90..ecc43067d9a5 are identical to commits a55aa89aab90..846e9b109913 apart from the commit message for commit e83dd16c24d8/654f7717ef51. And you have both sets of commits merged. -- Cheers, Stephen Rothwell pgp9xpWQdmzrS.pgp Description: OpenPGP digital signature
Re: [PATCH v2] fat: Add nobarrier to workaround the strange behavior of device
On Fri, 28 Jun 2019 23:32:06 +0900 OGAWA Hirofumi wrote: > > v2: > Just cleanup, changed the place of options under comment of fat. > > --- Please be careful with the "^---$" - it denotes "end of changelog", so I ended up without a changelog! > There was the report of strange behavior of device with recent > blkdev_issue_flush() position change. A Reported-by: would be nice, but not necessary. > The following is simplified usbmon trace. > > 4203 9.160230 host -> 1.25.1 USBMS 95 SCSI: Synchronize > Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) > 4206 9.164911 1.25.1 -> host USBMS 77 SCSI: Response LUN: > 0x00 (Synchronize Cache(10)) (Good) > 4207 9.323927 host -> 1.25.1 USBMS 95 SCSI: Read(10) LUN: > 0x00 (LBA: 0x00279950, Len: 240) > 4212 9.327138 1.25.1 -> host USBMS 77 SCSI: Response LUN: > 0x00 (Read(10)) (Good) > > [...] > > 7323 10.202167 host -> 1.25.1 USBMS 95 SCSI: Synchronize > Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) > 7326 10.432266 1.25.1 -> host USBMS 77 SCSI: Response LUN: > 0x00 (Synchronize Cache(10)) (Good) > 7327 10.769092 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready > LUN: 0x00 > 7330 10.769192 1.25.1 -> host USBMS 77 SCSI: Response LUN: > 0x00 (Test Unit Ready) (Good) > 7335 12.849093 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready > LUN: 0x00 > 7338 12.849206 1.25.1 -> host USBMS 77 SCSI: Response LUN: > 0x00 (Test Unit Ready) (Check Condition) > 7339 12.849209 host -> 1.25.1 USBMS 95 SCSI: Request Sense > LUN: 0x00 > > If "Synchronize Cache" command issued then there is idle time, the > device stop to process further commands, and behave as like no media. > (it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this > happened on Kindle) [just a guess, the device is trying to detect the > "safe-unplug" operation of Windows or such?] > > To workaround those devices and provide flexibility, this adds > "barrier"/"nobarrier" mount options to fat driver. 
I think it would be helpful if the changelog were to at least describe the longer-term plan which hch described. > --- linux/fs/fat/fat.h~fat-nobarrier 2019-06-28 21:22:18.146191739 +0900 > +++ linux-hirofumi/fs/fat/fat.h 2019-06-28 23:26:04.881215721 +0900 > @@ -51,6 +51,7 @@ struct fat_mount_options { >tz_set:1, /* Filesystem timestamps' offset set */ >rodir:1, /* allow ATTR_RO for directory */ >discard:1,/* Issue discard requests on deletions */ > + barrier:1,/* Issue FLUSH command */ >dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */ Documentation/filesystems/vfat.txt should be updated to describe this new option please. And perhaps to mention that a device-level quirk should be used in preference, if it is available.
Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device
Christoph Hellwig writes: > On Sat, Jun 29, 2019 at 12:03:46AM +0900, OGAWA Hirofumi wrote: >> I see, sounds like good though. Does it work for all stable versions? >> Can it disable only flush command without other effect? And it would be >> better to be normal user controllable easily. > > The option was added in 2.6.17, so it's been around forever. But > no, it obviously is not user exposed as using it on a normal drive > can lead to data loss. I see. It sounds like good as long term solution though (if no effect other than disabling flush command), it may need some monitor daemon and detect the device, and apply user policy as root. (BTW, I meant, workaround is normal user controllable with config by root or such) I think, it may not be good as short term solution for especially stable users. If there is no better short term solution, I would like to still push this patch as short term workaround. Thanks. -- OGAWA Hirofumi
Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device
On Sat, Jun 29, 2019 at 12:03:46AM +0900, OGAWA Hirofumi wrote: > I see, sounds like good though. Does it work for all stable versions? > Can it disable only flush command without other effect? And it would be > better to be normal user controllable easily. The option was added in 2.6.17, so it's been around forever. But no, it obviously is not user exposed as using it on a normal drive can lead to data loss.
Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device
Christoph Hellwig writes: > On Fri, Jun 28, 2019 at 11:18:19PM +0900, OGAWA Hirofumi wrote: >> To workaround those devices and provide flexibility, this adds >> "barrier"/"nobarrier" mount options to fat driver. > > We have deprecated these rather misnamed options, and now instead allow > tweaking the 'cache_type' attribute on the SCSI device. I see, sounds like good though. Does it work for all stable versions? Can it disable only flush command without other effect? And it would be better to be normal user controllable easily. This happened on normal user's calibre app that mount via udisks. With this option, user can workaround with /etc/fstab for immediate users. > That being said if the device behave this buggy you should also report > it to to the usb-storage and scsi maintainers so that we can add a > quirk for it. It might not be able to say as buggy simply. The device looks work if no idle and not hit pattern of usage, so quirk can be overkill. Anyway, I don't have the device, if you can get the device and investigate, it can be fine. Thanks. -- OGAWA Hirofumi
[PATCH v2] fat: Add nobarrier to workaround the strange behavior of device
v2: Just cleanup, changed the place of options under comment of fat. --- There was the report of strange behavior of device with recent blkdev_issue_flush() position change. The following is simplified usbmon trace. 4203 9.160230 host -> 1.25.1 USBMS 95 SCSI: Synchronize Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) 4206 9.164911 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Synchronize Cache(10)) (Good) 4207 9.323927 host -> 1.25.1 USBMS 95 SCSI: Read(10) LUN: 0x00 (LBA: 0x00279950, Len: 240) 4212 9.327138 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Read(10)) (Good) [...] 7323 10.202167 host -> 1.25.1 USBMS 95 SCSI: Synchronize Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) 7326 10.432266 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Synchronize Cache(10)) (Good) 7327 10.769092 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready LUN: 0x00 7330 10.769192 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Test Unit Ready) (Good) 7335 12.849093 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready LUN: 0x00 7338 12.849206 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Test Unit Ready) (Check Condition) 7339 12.849209 host -> 1.25.1 USBMS 95 SCSI: Request Sense LUN: 0x00 If "Synchronize Cache" command issued then there is idle time, the device stop to process further commands, and behave as like no media. (it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this happened on Kindle) [just a guess, the device is trying to detect the "safe-unplug" operation of Windows or such?] To workaround those devices and provide flexibility, this adds "barrier"/"nobarrier" mount options to fat driver. 
Cc: Signed-off-by: OGAWA Hirofumi --- fs/fat/fat.h |1 + fs/fat/file.c |8 ++-- fs/fat/inode.c | 22 +- 3 files changed, 24 insertions(+), 7 deletions(-) diff -puN fs/fat/fat.h~fat-nobarrier fs/fat/fat.h --- linux/fs/fat/fat.h~fat-nobarrier2019-06-28 21:22:18.146191739 +0900 +++ linux-hirofumi/fs/fat/fat.h 2019-06-28 23:26:04.881215721 +0900 @@ -51,6 +51,7 @@ struct fat_mount_options { tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1,/* Issue discard requests on deletions */ +barrier:1,/* Issue FLUSH command */ dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */ }; diff -puN fs/fat/file.c~fat-nobarrier fs/fat/file.c --- linux/fs/fat/file.c~fat-nobarrier 2019-06-28 21:22:18.147191734 +0900 +++ linux-hirofumi/fs/fat/file.c2019-06-28 23:26:04.881215721 +0900 @@ -193,17 +193,21 @@ static int fat_file_release(struct inode int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int err; err = __generic_file_fsync(filp, start, end, datasync); if (err) return err; - err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + err = sync_mapping_buffers(sbi->fat_inode->i_mapping); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (sbi->options.barrier) + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + return err; } diff -puN fs/fat/inode.c~fat-nobarrier fs/fat/inode.c --- linux/fs/fat/inode.c~fat-nobarrier 2019-06-28 21:22:18.148191730 +0900 +++ linux-hirofumi/fs/fat/inode.c 2019-06-28 23:26:28.029103863 +0900 @@ -1016,6 +1016,8 @@ static int fat_show_options(struct seq_f seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); + if (!opts->barrier) + seq_puts(m, ",nobarrier"); if (opts->dos1xfloppy) seq_puts(m, ",dos1xfloppy"); @@ -1031,8 +1033,9 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, 
Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_barrier, Opt_nobarrier, + Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro, + Opt_err, Opt_dos1xfloppy, }; static const match_table_t fat_tokens = { @@ -1062,6 +1065,8 @@ static const
Re: [PATCH] fat: Add nobarrier to work around the strange behavior of the device
On Fri, Jun 28, 2019 at 11:18:19PM +0900, OGAWA Hirofumi wrote: > To workaround those devices and provide flexibility, this adds > "barrier"/"nobarrier" mount options to fat driver. We have deprecated these rather misnamed options, and now instead allow tweaking the 'cache_type' attribute on the SCSI device. That being said, if the device is this buggy you should also report it to the usb-storage and scsi maintainers so that we can add a quirk for it.
[PATCH] fat: Add nobarrier to work around the strange behavior of the device
There was the report of strange behavior of device with recent blkdev_issue_flush() position change. The following is simplified usbmon trace. 4203 9.160230 host -> 1.25.1 USBMS 95 SCSI: Synchronize Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) 4206 9.164911 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Synchronize Cache(10)) (Good) 4207 9.323927 host -> 1.25.1 USBMS 95 SCSI: Read(10) LUN: 0x00 (LBA: 0x00279950, Len: 240) 4212 9.327138 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Read(10)) (Good) [...] 7323 10.202167 host -> 1.25.1 USBMS 95 SCSI: Synchronize Cache(10) LUN: 0x00 (LBA: 0x, Len: 0) 7326 10.432266 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Synchronize Cache(10)) (Good) 7327 10.769092 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready LUN: 0x00 7330 10.769192 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Test Unit Ready) (Good) 7335 12.849093 host -> 1.25.1 USBMS 95 SCSI: Test Unit Ready LUN: 0x00 7338 12.849206 1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 (Test Unit Ready) (Check Condition) 7339 12.849209 host -> 1.25.1 USBMS 95 SCSI: Request Sense LUN: 0x00 If "Synchronize Cache" command issued then there is idle time, the device stop to process further commands, and behave as like no media. (it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this happened on Kindle) [just a guess, the device is trying to detect the "safe-unplug" operation of Windows or such?] To workaround those devices and provide flexibility, this adds "barrier"/"nobarrier" mount options to fat driver. 
Cc: Signed-off-by: OGAWA Hirofumi --- fs/fat/fat.h |1 + fs/fat/file.c |8 ++-- fs/fat/inode.c | 16 ++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff -puN fs/fat/fat.h~fat-nobarrier fs/fat/fat.h --- linux/fs/fat/fat.h~fat-nobarrier2019-06-28 21:22:18.146191739 +0900 +++ linux-hirofumi/fs/fat/fat.h 2019-06-28 21:59:11.693934616 +0900 @@ -51,6 +51,7 @@ struct fat_mount_options { tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1,/* Issue discard requests on deletions */ +barrier:1,/* Issue FLUSH command */ dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */ }; diff -puN fs/fat/file.c~fat-nobarrier fs/fat/file.c --- linux/fs/fat/file.c~fat-nobarrier 2019-06-28 21:22:18.147191734 +0900 +++ linux-hirofumi/fs/fat/file.c2019-06-28 21:59:11.693934616 +0900 @@ -193,17 +193,21 @@ static int fat_file_release(struct inode int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int err; err = __generic_file_fsync(filp, start, end, datasync); if (err) return err; - err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + err = sync_mapping_buffers(sbi->fat_inode->i_mapping); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (sbi->options.barrier) + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + return err; } diff -puN fs/fat/inode.c~fat-nobarrier fs/fat/inode.c --- linux/fs/fat/inode.c~fat-nobarrier 2019-06-28 21:22:18.148191730 +0900 +++ linux-hirofumi/fs/fat/inode.c 2019-06-28 21:59:11.694934611 +0900 @@ -1016,6 +1016,8 @@ static int fat_show_options(struct seq_f seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); + if (!opts->barrier) + seq_puts(m, ",nobarrier"); if (opts->dos1xfloppy) seq_puts(m, ",dos1xfloppy"); @@ -1031,8 +1033,9 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, 
Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_barrier, Opt_nobarrier, + Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro, + Opt_err, Opt_dos1xfloppy, }; static const match_table_t fat_tokens = { @@ -1062,6 +1065,8 @@ static const match_table_t fat_tokens = {Opt_err_panic, "errors=panic"},
Re: Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Wed, 12 Jun 2019, Rafael J. Wysocki wrote: > > > hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe > > > it manually and that > > > appears to be blocking (apparently indefinitely) until terminated with > > > ^C. But then it turns > > > out that hid-logitech-dj is there in the list of modules and it is in use > > > (by usbhid) and the > > > mouse works. > > > > My bad, I should've asked you to test with the complete 'for-5.2/fixes' > > branch which contains two reverts [1] [2] that should fix this issue as > > well. > > > > [1] > > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=e0b7f9bc0246bc642d1de2ff3ff133730584c956 > > [2] > > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=f9482dabfd1686987cc6044e06ae0e4c05915518 > > Yes, with the two reverts applied in addition to the fix, it all works as > expected. Rafael, thanks a lot for testing. I am sending the pile to Linus today or tomorrow latest. -- Jiri Kosina SUSE Labs
Re: Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Wednesday, June 12, 2019 10:31:45 AM CEST Jiri Kosina wrote: > On Wed, 12 Jun 2019, Rafael J. Wysocki wrote: > > > It kind of helps, but there is a catch. > > > > hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it > > manually and that > > appears to be blocking (apparently indefinitely) until terminated with ^C. > > But then it turns > > out that hid-logitech-dj is there in the list of modules and it is in use > > (by usbhid) and the > > mouse works. > > My bad, I should've asked you to test with the complete 'for-5.2/fixes' > branch which contains two reverts [1] [2] that should fix this issue as > well. > > [1] > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=e0b7f9bc0246bc642d1de2ff3ff133730584c956 > [2] > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=f9482dabfd1686987cc6044e06ae0e4c05915518 Yes, with the two reverts applied in addition to the fix, it all works as expected. Thanks!
Re: Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Wed, 12 Jun 2019, Rafael J. Wysocki wrote: > It kind of helps, but there is a catch. > > hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it > manually and that > appears to be blocking (apparently indefinitely) until terminated with ^C. > But then it turns > out that hid-logitech-dj is there in the list of modules and it is in use (by > usbhid) and the > mouse works. My bad, I should've asked you to test with the complete 'for-5.2/fixes' branch which contains two reverts [1] [2] that should fix this issue as well. [1] https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=e0b7f9bc0246bc642d1de2ff3ff133730584c956 [2] https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=f9482dabfd1686987cc6044e06ae0e4c05915518 -- Jiri Kosina SUSE Labs
Re: Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Wednesday, June 12, 2019 12:02:21 AM CEST Jiri Kosina wrote: > On Tue, 11 Jun 2019, Rafael J. Wysocki wrote: > > > I noticed that the cordless mouse used by me with one of the machines here > > stopped to work in 5.2-rc (up to and including the -rc4). > > > > Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for > > non > > unifying receivers"). > > > > Of course, that commit does not revert cleanly from 5.2-rc4, but I have > > reverted > > the changes made by it in hid/hid-ids.h and I took the version of > > hid/hid-logitech-dj.c > > from commit b6aeeddef68d ("HID: logitech-dj: add > > logi_dj_recv_queue_unknown_work > > helper"), which is the parent of commit 74808f9115ce, and that made the > > mouse > > work again for me. > > > > Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above > > changes: > > > > [4.288905] usb 1-2: Manufacturer: Logitech > > [5.444621] input: Logitech USB Receiver as > > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23 > > [5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID > > v1.11 Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0 > > [5.451265] input: Logitech USB Receiver Consumer Control as > > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24 > > [5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB > > HID v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1 > > Hi Rafael, > > 0x046d/0xc52f is known to have issues in 5.2-rcX. There is a patch queued > [1] that is believed to fix all this; my plan is to send it to Linus in > the coming 1-2 days. If you could report whether it fixes the issues > you've been seeing yourself as well, it'd be helpful. > > Thanks. > > [1] > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=3ed224e273ac5880eeab4c3043a6b06b0478dd56 It kind of helps, but there is a catch. 
hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it manually and that appears to be blocking (apparently indefinitely) until terminated with ^C. But then it turns out that hid-logitech-dj is there in the list of modules and it is in use (by usbhid) and the mouse works. I guess I need to update the mkinitrd configuration, but even so that is not exactly straightforward IMO. :-) Cheers!
Re: Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Tue, 11 Jun 2019, Rafael J. Wysocki wrote: > I noticed that the cordless mouse used by me with one of the machines here > stopped to work in 5.2-rc (up to and including the -rc4). > > Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for > non > unifying receivers"). > > Of course, that commit does not revert cleanly from 5.2-rc4, but I have > reverted > the changes made by it in hid/hid-ids.h and I took the version of > hid/hid-logitech-dj.c > from commit b6aeeddef68d ("HID: logitech-dj: add > logi_dj_recv_queue_unknown_work > helper"), which is the parent of commit 74808f9115ce, and that made the mouse > work again for me. > > Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above > changes: > > [4.288905] usb 1-2: Manufacturer: Logitech > [5.444621] input: Logitech USB Receiver as > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23 > [5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID v1.11 > Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0 > [5.451265] input: Logitech USB Receiver Consumer Control as > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24 > [5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB > HID v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1 Hi Rafael, 0x046d/0xc52f is known to have issues in 5.2-rcX. There is a patch queued [1] that is believed to fix all this; my plan is to send it to Linus in the coming 1-2 days. If you could report whether it fixes the issues you've been seeing yourself as well, it'd be helpful. Thanks. [1] https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes&id=3ed224e273ac5880eeab4c3043a6b06b0478dd56 -- Jiri Kosina SUSE Labs
Strange regression in hid_logitech_dj (was: Re: Linux 5.2-rc4)
On Sunday, June 9, 2019 5:46:48 AM CEST Linus Torvalds wrote: > No, I'm not confused, and I haven't lost track of what day it is, I do > actually know that it's still Saturday here, not Sunday, and I'm just > doing rc4 a bit early because I'll be on an airplane during my normal > release time. And while I've done releases on airports and airplanes > before, I looked at my empty queue of pull requests and went "let's > just do it now". > > We've had a fairly calm release so far, and on the whole that seems to > hold. rc4 isn't smaller than rc3 was (it's a bit bigger), but rc3 was > fairly small, so the size increase isn't all that worrisome. I do hope > that we'll start actually shrinking now, though. I noticed that the cordless mouse used by me with one of the machines here stopped to work in 5.2-rc (up to and including the -rc4). Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for non unifying receivers"). Of course, that commit does not revert cleanly from 5.2-rc4, but I have reverted the changes made by it in hid/hid-ids.h and I took the version of hid/hid-logitech-dj.c from commit b6aeeddef68d ("HID: logitech-dj: add logi_dj_recv_queue_unknown_work helper"), which is the parent of commit 74808f9115ce, and that made the mouse work again for me. Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above changes: [4.288905] usb 1-2: Manufacturer: Logitech [5.444621] input: Logitech USB Receiver as /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23 [5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID v1.11 Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0 [5.451265] input: Logitech USB Receiver Consumer Control as /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24 [5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB HID v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1 Please let me know what you need to diagnose this. 
Thanks, Rafael
Re: Strange issues with epoll since 5.0
On Thu, 02 May 2019, Deepa Dinamani wrote: Reported-by: Omar Kilani Do we actually know if this was the issue Omar was hitting? Thanks, Davidlohr
Re: Strange issues with epoll since 5.0
Deepa Dinamani wrote: > Eric, > Can you please help test this? Nope, that was _really_ badly whitespace-damaged. (C'mon, it's not like you're new to this)
Re: Strange issues with epoll since 5.0
Eric, Can you please help test this? If this solves your problem, I can post the fix. Thanks, - Deepa -8<--- Subject: [PATCH] signal: Adjust error codes according to restore_user_sigmask() For all the syscalls that receive a sigmask from the userland, the user sigmask is to be in effect through the syscall execution. At the end of syscall, sigmask of the current process is restored to what it was before the switch over to user sigmask. But, for this to be true in practice, the sigmask should be restored only at the the point we change the saved_sigmask. Anything before that loses signals. And, anything after is just pointless as the signal is already lost by restoring the sigmask. The issue was detected because of a regression caused by 854a6ed56839a. The patch moved the signal_pending() check closer to restoring of the user sigmask. But, it failed to update the error code accordingly. Detailed issue discussion permalink: https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/ Note that the patch returns interrupted errors (EINTR, ERESTARTNOHAND, etc) only when there is no other error. If there is a signal and an error like EINVAL, the syscalls return -EINVAL rather than the interrupted error codes. 
Reported-by: Omar Kilani Reported-by: Eric Wong Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()") Signed-off-by: Deepa Dinamani --- fs/aio.c | 24 fs/eventpoll.c | 14 ++ fs/io_uring.c | 9 ++--- fs/select.c| 37 + include/linux/signal.h | 2 +- kernel/signal.c| 8 +--- 6 files changed, 55 insertions(+), 39 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 38b741aef0bf..7de2f7573d55 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2133,7 +2133,7 @@ SYSCALL_DEFINE6(io_pgetevents, struct __aio_sigsetksig = { NULL, }; sigset_tksigmask, sigsaved; struct timespec64ts; -int ret; +int ret, signal_detected; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; @@ -2146,8 +2146,8 @@ SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); -restore_user_sigmask(ksig.sigmask, &sigsaved); -if (signal_pending(current) && !ret) +signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved); +if (signal_detected && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2166,7 +2166,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32, struct __aio_sigsetksig = { NULL, }; sigset_tksigmask, sigsaved; struct timespec64ts; -int ret; +int ret, signal_detected; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) return -EFAULT; @@ -2180,8 +2180,8 @@ SYSCALL_DEFINE6(io_pgetevents_time32, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&ts : NULL); -restore_user_sigmask(ksig.sigmask, &sigsaved); -if (signal_pending(current) && !ret) +signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved); +if (signal_detected && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2231,7 +2231,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct __compat_aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 t; -int ret; +int ret, signal_detected; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; @@ -2244,8 +2244,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); -restore_user_sigmask(ksig.sigmask, &sigsaved); -if (signal_pending(current) && !ret) +signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved); +if (signal_detected && !ret) ret = -ERESTARTNOHAND; return ret; @@ -2264,7 +2264,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, struct __compat_aio_sigset ksig = { NULL, }; sigset_t ksigmask, sigsaved; struct timespec64 t; -int ret; +int ret, signal_detected; if (timeout && get_timespec64(&t, timeout)) return -EFAULT; @@ -2277,8 +2277,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); -restore_user_sigmask(ksig.sigmask, &sigsaved); -if (signal_pending(current) && !ret) +signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved); +if (signal_detected && !ret) ret = -ERESTARTNOHAND; return ret; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a0e98d87fcc..fe5a0724b417 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask,
Re: Strange issues with epoll since 5.0
On Wed, May 1, 2019 at 1:48 PM Eric Wong wrote: > > Deepa Dinamani wrote: > > So here is my analysis: > > > > > So the 854a6ed56839a40f6 seems to be better than the original code in > > that it detects the signal. > > OTOH, does matter to anybody that a signal is detected slightly > sooner than it would've been, otherwise? The original code drops the signal altogether. This is because it overwrites the current's sigmask with the provided one(set_current_blocked()). If a signal bit was set, it is lost forever. It does not detect it sooner. The check for pending signal is sooner and not just before the syscall returns. This is what the patch in discussion does: check for signals just before returning. > > > But, the problem is that it doesn't > > communicate it to the userspace. > > Yup, that's a big problem :) > > > So a patch like below solves the problem. This is incomplete. I'll > > verify and send you a proper fix you can test soon. This is just for > > the sake of discussion: > > > > diff --git a/fs/eventpoll.c b/fs/eventpoll.c > > index 4a0e98d87fcc..63a387329c3d 100644 > > --- a/fs/eventpoll.c > > +++ b/fs/eventpoll.c > > @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct > > epoll_event __user *, events, > > int, maxevents, int, timeout, const sigset_t __user *, > > sigmask, > > size_t, sigsetsize) > > { > > - int error; > > + int error, signal_detected; > > sigset_t ksigmask, sigsaved; > > > > /* > > @@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct > > epoll_event __user *, events, > > > > error = do_epoll_wait(epfd, events, maxevents, timeout); > > > > - restore_user_sigmask(sigmask, &sigsaved); > > + signal_detected = restore_user_sigmask(sigmask, &sigsaved); > > + > > + if (signal_detected && !error) > > + return -EITNR; > > > > return error; > > Looks like a reasonable API. 
> > > @@ -2862,7 +2862,7 @@ void restore_user_sigmask(const void __user > > *usigmask, sigset_t *sigsaved) > > if (signal_pending(current)) { > > current->saved_sigmask = *sigsaved; > > set_restore_sigmask(); > > - return; > > + return 0; > > Shouldn't that "return 1" if a signal is pending? Yep, I meant this to be 1. -Deepa
Re: Strange issues with epoll since 5.0
Deepa Dinamani wrote: > So here is my analysis: > So the 854a6ed56839a40f6 seems to be better than the original code in > that it detects the signal. OTOH, does matter to anybody that a signal is detected slightly sooner than it would've been, otherwise? > But, the problem is that it doesn't > communicate it to the userspace. Yup, that's a big problem :) > So a patch like below solves the problem. This is incomplete. I'll > verify and send you a proper fix you can test soon. This is just for > the sake of discussion: > > diff --git a/fs/eventpoll.c b/fs/eventpoll.c > index 4a0e98d87fcc..63a387329c3d 100644 > --- a/fs/eventpoll.c > +++ b/fs/eventpoll.c > @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct > epoll_event __user *, events, > int, maxevents, int, timeout, const sigset_t __user *, > sigmask, > size_t, sigsetsize) > { > - int error; > + int error, signal_detected; > sigset_t ksigmask, sigsaved; > > /* > @@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct > epoll_event __user *, events, > > error = do_epoll_wait(epfd, events, maxevents, timeout); > > - restore_user_sigmask(sigmask, &sigsaved); > + signal_detected = restore_user_sigmask(sigmask, &sigsaved); > + > + if (signal_detected && !error) > + return -EITNR; > > return error; Looks like a reasonable API. > @@ -2862,7 +2862,7 @@ void restore_user_sigmask(const void __user > *usigmask, sigset_t *sigsaved) > if (signal_pending(current)) { > current->saved_sigmask = *sigsaved; > set_restore_sigmask(); > - return; > + return 0; Shouldn't that "return 1" if a signal is pending?
Re: Strange issues with epoll since 5.0
Thanks for trying the fix. So here is my analysis: Let's start with epoll_pwait: ep_poll() is what checks for signal_pending() and is responsible for setting errno to -EINTR when there is a signal. So if a signal is received after ep_poll(), it is never noticed by the syscall during execution. Moreover, the original code before 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()"), had the following call flow: error = do_epoll_wait(epfd, events, maxevents, timeout); Here error = 0 if the signal is received after ep_poll(). - /* -* If we changed the signal mask, we need to restore the original one. -* In case we've got a signal while waiting, we do not restore the -* signal mask yet, and we allow do_signal() to deliver the signal on -* the way back to userspace, before the signal mask is restored. -*/ - if (sigmask) { - if (error == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else Execution reaches this else statement and the sigmask is restored directly, ignoring the newly generated signal. The signal is never handled. - set_current_blocked(&sigsaved); - } In the current execution flow: error = do_epoll_wait(epfd, events, maxevents, timeout); error is still 0 as ep_poll() did not detect the signal. restore_user_sigmask(sigmask, &sigsaved, error == -EITNR); void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) { if (!usigmask) return; /* * When signals are pending, do not restore them here. * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. */ if (signal_pending(current)) { execution path reaches here and do_signal() actually delivers the signal to userspace. But the errno is not set. So the userspace fails to notice it. 
current->saved_sigmask = *sigsaved; set_restore_sigmask(); return; } /* * This is needed because the fast syscall return path does not restore * saved_sigmask when signals are not pending. */ set_current_blocked(sigsaved); } For other syscalls in the same commit: sys_io_pgetevents() does not seem to have this problem as we are still checking signal_pending() here. sys_pselect6() seems to have a similar problem. The changes to sys_pselect6() also impact sys_select() as the changes are in the common code path. So the 854a6ed56839a40f6 seems to be better than the original code in that it detects the signal. But, the problem is that it doesn't communicate it to the userspace. So a patch like below solves the problem. This is incomplete. I'll verify and send you a proper fix you can test soon. This is just for the sake of discussion: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a0e98d87fcc..63a387329c3d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { - int error; + int error, signal_detected; sigset_t ksigmask, sigsaved; /* @@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + signal_detected = restore_user_sigmask(sigmask, &sigsaved); + + if (signal_detected && !error) + return -EITNR; return error; } @@ -2342,7 +2345,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - long err; + long err, signal_detected; sigset_t ksigmask, sigsaved; /* @@ -2355,7 +2358,10 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, err = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + signal_detected = restore_user_sigmask(sigmask, 
&sigsaved); + + if (signal_detected && !err) + return -EITNR; return err; } diff --git a/kernel/signal.c b/kernel/signal.c index 3a9e41197d46..c76ab2a52ebf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2849,11 +2849,11 @@ EXPORT_SYMBOL(set_compat_user_sigmask); * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +int restore
Re: Strange issues with epoll since 5.0
Eric Wong wrote: > (didn't test AIO, but everything else seems good) "seems" != "is" Now that I understand the fix for epoll, the fs/select.c changes would hit the same problem and not return -EINTR when it should. I'll let you guys decide how to fix this, but there's definitely a problem when "(errno == EINTR)" comparisons in userspace stop working.
Re: Strange issues with epoll since 5.0
Eric Wong wrote: > Deepa Dinamani wrote: > > I'm not sure what the hang in the userspace is about. Is it because > > the syscall did not return an error or the particular signal was > > blocked etc. > > Uh, ok; that's less comforting. Nevermind, I think I understand everything, now. epoll_pwait never set errno without our patch. cmogstored does this in notify.c: /* wait_intr calls epoll_pwait: */ mfd = mog_idleq_wait_intr(mog_notify_queue, timeout); if (mfd) notify_queue_step(mfd); else if (errno == EINTR) /* EINTR fails to be noticed */ note_run();
Re: Strange issues with epoll since 5.0
Deepa Dinamani wrote: > I was also not able to reproduce this. > Arnd and I were talking about this today morning. Here is something > Arnd noticed: > > If there was a signal after do_epoll_wait(), we never were not > entering the if (err = -EINTR) at all before. I'm not sure which `if' statement you're talking about, but ok... > But, now we do. > We could try with the below patch: Wasn't close to applying or being buildable, but I put a working version together (below). epoll_pwait wakes up as expected, now :> > If this works that means we know what is busted. OK, good to know... > I'm not sure what the hang in the userspace is about. Is it because > the syscall did not return an error or the particular signal was > blocked etc. Uh, ok; that's less comforting. > There are also a few timing differences also. But, can we try this first? Anyways, the following patch works and builds cleanly for me (didn't test AIO, but everything else seems good) Thanks! -8<--- Subject: [PATCH] test fix from Deepa TBD --- fs/aio.c | 8 fs/eventpoll.c | 4 ++-- fs/select.c| 12 ++-- include/linux/signal.h | 2 +- kernel/signal.c| 6 -- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 3d9669d011b9..d54513ca11b6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2146,7 +2146,7 @@ SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); + restore_user_sigmask(ksig.sigmask, &sigsaved, -1); if (signal_pending(current) && !ret) ret = -ERESTARTNOHAND; @@ -2180,7 +2180,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&ts : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); + restore_user_sigmask(ksig.sigmask, &sigsaved, -1); if (signal_pending(current) && !ret) ret = -ERESTARTNOHAND; @@ -2244,7 +2244,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); + restore_user_sigmask(ksig.sigmask, &sigsaved, -1); if (signal_pending(current) && !ret) ret = -ERESTARTNOHAND; @@ -2277,7 +2277,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - restore_user_sigmask(ksig.sigmask, &sigsaved); + restore_user_sigmask(ksig.sigmask, &sigsaved, -1); if (signal_pending(current) && !ret) ret = -ERESTARTNOHAND; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a5d219d920e7..bd84ec54a8fb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2247,7 +2247,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, error == -EINTR); return error; } @@ -2272,7 +2272,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, err = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, err == -EINTR); return err; } diff --git a/fs/select.c b/fs/select.c index d0f35dbc0e8f..04720e5856ed 100644 --- a/fs/select.c +++ b/fs/select.c @@ -760,7 +760,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, -1); return ret; } @@ -1106,7 +1106,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); - 
restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, -1); /* We can restart this syscall, usually */ if (ret == -EINTR) @@ -1142,7 +1142,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, -1); /* We can restart this syscall, usually */ if (ret == -EINTR) @@ -1352,7 +1352,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigm
Re: Strange issues with epoll since 5.0
I was also not able to reproduce this. Arnd and I were talking about this today morning. Here is something Arnd noticed: If there was a signal after do_epoll_wait(), we never were not entering the if (err = -EINTR) at all before. But, now we do. We could try with the below patch: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a0e98d87fcc..5cfb800cf598 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2330,7 +2330,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + restore_user_sigmask(sigmask, &sigsaved, error == -EITNR); return error; } diff --git a/kernel/signal.c b/kernel/signal.c index 3a9e41197d46..4a8f96f5c1c0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2849,7 +2849,7 @@ EXPORT_SYMBOL(set_compat_user_sigmask); * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, int sig_pending) { if (!usigmask) @@ -2859,7 +2859,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. */ - if (signal_pending(current)) { + if (sig_pending) { current->saved_sigmask = *sigsaved; set_restore_sigmask(); If this works that means we know what is busted. I'm not sure what the hang in the userspace is about. Is it because the syscall did not return an error or the particular signal was blocked etc. There are also a few timing differences also. But, can we try this first? -Deepa
Re: Strange issues with epoll since 5.0
Davidlohr Bueso wrote: > On Sun, 28 Apr 2019, Eric Wong wrote: > > > Just running one test won't trigger since it needs a busy > > machine; but: > > > > make test/mgmt_auto_adjust.log > > (and "rm make test/mgmt_auto_adjust.log" if you want to rerun) > > fyi no luck reproducing on both either a large (280) or small (4 cpu) > machine, I'm running it along with a kernel build overcommiting cpus x2. Thanks for taking a look. > Is there any workload in particular you are using to stress the box? Just the "make -j$N check" I mentioned up-thread which spawns a lot of tests in parallel. For the small 4 CPU machine, -j32 or -j16 ought to be reproducing the problem. I'll try to set aside some time this week to get familiar with kernel internals w.r.t. signal handling.
Re: Strange issues with epoll since 5.0
On Sun, 28 Apr 2019, Eric Wong wrote: Just running one test won't trigger since it needs a busy machine; but: make test/mgmt_auto_adjust.log (and "rm make test/mgmt_auto_adjust.log" if you want to rerun) fyi no luck reproducing on both either a large (280) or small (4 cpu) machine, I'm running it along with a kernel build overcommiting cpus x2. Is there any workload in particular you are using to stress the box? Thanks, Davidlohr
Re: Strange issues with epoll since 5.0
Deepa Dinamani wrote: > I tried to replicate the failure on qemu. > I do not see the failure with N=32. > Does it work for N < 32? Depends on number of cores you have; I have 4 cores, 8 threads with HT; so I needed to have a lot of load on the machine to get it to fail (it takes about 1 minute). cmogstored is intended to run on machines that were already saturated in CPU/memory from other processes, but not HDD I/O bandwidth. > Does any other signal work? SIGCONT does, via: perl -i -p -e 's/SIGURG/SIGCONT/g' `git ls-files` > Are there any other architectures that fail? I don't have other arches (well, 32-bit x86, but I've never really tried cmogstored on that, even). > Could you help me figure out how to run just the one test that is failing? Just running one test won't trigger since it needs a busy machine; but: make test/mgmt_auto_adjust.log (and "rm make test/mgmt_auto_adjust.log" if you want to rerun) Thanks for looking into this. Fwiw, cmogstored uses epoll in strange and uncommon ways which has led to kernel bugfixes in the past.
Re: Strange issues with epoll since 5.0
I tried to replicate the failure on qemu. I do not see the failure with N=32. Does it work for N < 32? Does any other signal work? Are there any other architectures that fail? Could you help me figure out how to run just the one test that is failing? -Deepa
Re: Strange issues with epoll since 5.0
Eric Wong wrote: > Omar Kilani wrote: > > Hi there, > > > > I’m still trying to piece together a reproducible test that triggers > > this, but I wanted to post in case someone goes “hmmm... change X > > might have done this”. > > Maybe Davidlohr knows, since he's responsible for most of the > epoll changes in 5.0. Well, I am not sure if I am hitting the same problem Omar is hitting. But I did find an epoll_pwait regression in 5.0: epoll_pwait seems unresponsive to SIGURG in my heavily-parallelized use case[1] on 5.0.9. I bisected it to commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()") Just reverting the fs/eventpoll.c change in 854a6ed56 seems enough to fix the non-responsive epoll_pwait for me. I have not looked deeply into this, but perhaps the signal_pending check in restore_user_sigmask is racy w.r.t. epoll. It is been a while since I have looked at kernel stuff, myself. Anyways, this revert works; but I'm not 100% sure why... diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a5d219d920e7..151739d76801 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2247,7 +2247,20 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + /* +* If we changed the signal mask, we need to restore the original one. +* In case we've got a signal while waiting, we do not restore the +* signal mask yet, and we allow do_signal() to deliver the signal on +* the way back to userspace, before the signal mask is restored. 
+*/ + if (sigmask) { + if (error == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } return error; } @@ -2272,7 +2285,20 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, err = do_epoll_wait(epfd, events, maxevents, timeout); - restore_user_sigmask(sigmask, &sigsaved); + /* +* If we changed the signal mask, we need to restore the original one. +* In case we've got a signal while waiting, we do not restore the +* signal mask yet, and we allow do_signal() to deliver the signal on +* the way back to userspace, before the signal mask is restored. +*/ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } return err; } Comments and/or a proper fix would be greatly appreciated. [1] my test case is running the cmogstored 1.7.0 test suite in amd64 Debian stable environment. test/mgmt_auto_adjust would get stuck and time-out after 60s on vanilla v5.0.9 tgz: https://bogomips.org/cmogstored/files/cmogstored-1.7.0.tar.gz # Standard autotools install, N=32 or some high-ish number ./configure make -j$N make check -j$N # OR git clone https://bogomips.org/cmogstored.git So, requoting the rest of Omar's original report, here; since I am not sure if his use case involves epoll_pwait like mine does: > Omar Kilani wrote: > > Basically, something’s broken (or at least, has changed enough to > > cause problems in user space) in epoll since 5.0. It’s still broken in > > 5.1-rc5. > > > > It doesn’t happen 100% of the time. It’s sort of hard to pin down but > > I’ve observed the following: > > > > * nginx not accepting connections under load > > * A java app which uses netty / NIO having strange writability > > semantics on channels, which confuses netty / java enough to not > > properly flush written data on the socket.
> > > > I went and tested these Linux kernels: > > > > 4.20.17 > > 4.19.32 > > 4.14.111 > > > > And the issue(s) do not show up there. > > > > I’m still actively chasing this up, and will report back — I haven’t > > touched kernel code in 15 years so I’m a little rusty. :) > > > > Regards, > > Omar
Re: Strange issues with epoll since 5.0
On Wed, 24 Apr 2019, Davidlohr Bueso wrote: On Wed, 24 Apr 2019, Eric Wong wrote: Omar Kilani wrote: Hi there, I’m still trying to piece together a reproducible test that triggers this, but I wanted to post in case someone goes “hmmm... change X might have done this”. Maybe Davidlohr knows, since he's responsible for most of the epoll changes in 5.0. Not really, I have not been made aware of any issues until now. Basically, something’s broken (or at least, has changed enough to cause problems in user space) in epoll since 5.0. It’s still broken in 5.1-rc5. It doesn’t happen 100% of the time. It’s sort of hard to pin down but I’ve observed the following: * nginx not accepting connections under load * A java app which uses netty / NIO having strange writability semantics on channels, which confuses netty / java enough to not properly flush written data on the socket. Off the top of my head, could the following be responsible? c5a282e9635 (fs/epoll: reduce the scope of wq lock in epoll_wait())
Re: Strange issues with epoll since 5.0
On Wed, 24 Apr 2019, Eric Wong wrote: Omar Kilani wrote: Hi there, I’m still trying to piece together a reproducible test that triggers this, but I wanted to post in case someone goes “hmmm... change X might have done this”. Maybe Davidlohr knows, since he's responsible for most of the epoll changes in 5.0. Not really, I have not been made aware of any issues until now. Basically, something’s broken (or at least, has changed enough to cause problems in user space) in epoll since 5.0. It’s still broken in 5.1-rc5. It doesn’t happen 100% of the time. It’s sort of hard to pin down but I’ve observed the following: * nginx not accepting connections under load * A java app which uses netty / NIO having strange writability semantics on channels, which confuses netty / java enough to not properly flush written data on the socket. I went and tested these Linux kernels: 4.20.17 4.19.32 4.14.111 And the issue(s) do not show up there. I’m still actively chasing this up, and will report back — I haven’t touched kernel code in 15 years so I’m a little rusty. :) A bisection and/or workload that triggers the issue would be great to see what's going on. Thanks, Davidlohr
Re: Strange issues with epoll since 5.0
Omar Kilani wrote: > Hi there, > > I’m still trying to piece together a reproducible test that triggers > this, but I wanted to post in case someone goes “hmmm... change X > might have done this”. Maybe Davidlohr knows, since he's responsible for most of the epoll changes in 5.0. > Basically, something’s broken (or at least, has changed enough to > cause problems in user space) in epoll since 5.0. It’s still broken in > 5.1-rc5. > > It doesn’t happen 100% of the time. It’s sort of hard to pin down but > I’ve observed the following: > > * nginx not accepting connections under load > * A java app which uses netty / NIO having strange writability > semantics on channels, which confuses netty / java enough to not > properly flush written data on the socket. > > I went and tested these Linux kernels: > > 4.20.17 > 4.19.32 > 4.14.111 > > And the issue(s) do not show up there. > > I’m still actively chasing this up, and will report back — I haven’t > touched kernel code in 15 years so I’m a little rusty. :) > > Regards, > Omar
Strange issues with epoll since 5.0
Hi there, I’m still trying to piece together a reproducible test that triggers this, but I wanted to post in case someone goes “hmmm... change X might have done this”. Basically, something’s broken (or at least, has changed enough to cause problems in user space) in epoll since 5.0. It’s still broken in 5.1-rc5. It doesn’t happen 100% of the time. It’s sort of hard to pin down but I’ve observed the following: * nginx not accepting connections under load * A java app which uses netty / NIO having strange writability semantics on channels, which confuses netty / java enough to not properly flush written data on the socket. I went and tested these Linux kernels: 4.20.17 4.19.32 4.14.111 And the issue(s) do not show up there. I’m still actively chasing this up, and will report back — I haven’t touched kernel code in 15 years so I’m a little rusty. :) Regards, Omar
Re: Kernel 4.9: strange behavior with fifo scheduler
On 2/6/19 2:25 PM, Frédéric Mathieu wrote: Hi Dietmar, Attention !, these tests were executed on a kernel with the patch RT and the option CONFIG_PREEMPT_RT_FULL = y. I confirm the truth of my priority settings On a vanilla kernel, I get the same results as you. After talking with mike Galbraith, I turned my attention to the priority of kernel threads. The following link explains in the behavior of the scheduler : https://wiki.linuxfoundation.org/realtime/documentation/technical_details/hr_timers Contrary to what I thought, there is no dynamic adjustment of the priority according to the priority of the calling task. -Message d'origine- De : linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-ow...@vger.kernel.org] De la part de Dietmar Eggemann Envoyé : mercredi 6 février 2019 11:55 À : Frédéric Mathieu ; linux-kernel@vger.kernel.org Objet : Re: Kernel 4.9: strange behavior with fifo scheduler Hi Frédéric, On 2/5/19 11:47 AM, Frédéric Mathieu wrote: Hi, on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), I use the linux kernel 4.9.146 with patch rt 125. uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 CET 2019 x86_64 GNU/Linux Ah, OK, I should have read it more carefully! So you're talking about the ktimersoftd/0 thread in this case which makes cyclictest show the latency of ~6ms every 5th time. v4.9 + rt 125 on ARM64 Juno: (test prio (49) > cyclictest prio (50) ... 
ktimersoftd/0-4 [000]54.948862: sched_pi_setprio: comm=ktimersoftd/0 pid=4 oldprio=49 newprio=98 ktimersoftd/0-4 [000]54.948867: sched_waking: comm=cyclictest pid=2730 prio=49 target_cpu=000 ktimersoftd/0-4 [000]54.948870: sched_wakeup: cyclictest:2730 [49] success=1 CPU:000 ktimersoftd/0-4 [000]54.948874: sched_switch: ktimersoftd/0:4 [98] R ==> cyclictest:2730 [49] cyclictest-2730 [000]54.948890: sched_switch: cyclictest:2730 [49] S ==> ktimersoftd/0:4 [98] ktimersoftd/0-4 [000]54.948906: sched_switch: ktimersoftd/0:4 [98] S ==> swapper/0:0 [120] -0 [000]54.949087: sched_waking: comm=test pid=2723 prio=50 target_cpu=000 -0 [000]54.949090: sched_wakeup: test:2723 [50] success=1 CPU:000 -0 [000]54.949098: sched_switch: swapper/0:0 [120] R ==> test:2723 [50] test-2723 [000]54.949784: sched_waking: comm=ktimersoftd/0 pid=4 prio=98 target_cpu=000 test-2723 [000]54.949787: sched_wakeup: ktimersoftd/0:4 [98] success=1 CPU:000 test-2723 [000]54.951979: sched_waking: comm=ksoftirqd/0 pid=3 prio=120 target_cpu=000 test-2723 [000]54.951983: sched_wakeup: ksoftirqd/0:3 [120] success=1 CPU:000 test-2723 [000]54.955111: sched_switch: test:2723 [50] S ==> ktimersoftd/0:4 [98] ktimersoftd/0-4 [000]54.955125: sched_waking: comm=cyclictest pid=2730 prio=49 target_cpu=000 ktimersoftd/0-4 [000]54.955128: sched_wakeup: cyclictest:2730 [49] success=1 CPU:000 ktimersoftd/0-4 [000]54.955132: sched_switch: ktimersoftd/0:4 [98] R ==> cyclictest:2730 [49] cyclictest-2730 [000]54.955138: sched_pi_setprio: comm=ktimersoftd/0 pid=4 oldprio=98 newprio=49 cyclictest-2730 [000]54.955145: sched_switch: cyclictest:2730 [49] D ==> ktimersoftd/0:4 [49] ktimersoftd/0-4 [000]54.955149: sched_pi_setprio: comm=ktimersoftd/0 pid=4 oldprio=49 newprio=98 ktimersoftd/0-4 [000]54.955154: sched_waking: comm=cyclictest pid=2730 prio=49 target_cpu=000 ktimersoftd/0-4 [000]54.955157: sched_wakeup: cyclictest:2730 [49] success=1 CPU:000 ... [...]
RE: Kernel 4.9: strange behavior with fifo scheduler
Hi Dietmar, Attention !, these tests were executed on a kernel with the patch RT and the option CONFIG_PREEMPT_RT_FULL = y. I confirm the truth of my priority settings On a vanilla kernel, I get the same results as you. After talking with mike Galbraith, I turned my attention to the priority of kernel threads. The following link explains in the behavior of the scheduler : https://wiki.linuxfoundation.org/realtime/documentation/technical_details/hr_timers Contrary to what I thought, there is no dynamic adjustment of the priority according to the priority of the calling task. -Message d'origine- De : linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-ow...@vger.kernel.org] De la part de Dietmar Eggemann Envoyé : mercredi 6 février 2019 11:55 À : Frédéric Mathieu ; linux-kernel@vger.kernel.org Objet : Re: Kernel 4.9: strange behavior with fifo scheduler Hi Frédéric, On 2/5/19 11:47 AM, Frédéric Mathieu wrote: > Hi, > > on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), > I use the linux kernel 4.9.146 with patch rt 125. > uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 > CET 2019 > x86_64 GNU/Linux > > I observed a strange behavior of the scheduler when several tasks are > executed in FIFO mode on a CPU core and a significant CPU activity. > > first test (reference: cpu load=0%): > cyclictest -m -D 5 -i 1000 -p 50 -a 0 > # / dev / cpu_dma_latency set to 0us > policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305 > T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: > 18 work fine > > now, i'm loading the system on the cpu core 0 with a homemade process: >cpu load 60%, sched FIFO prio 1, cpu 0 Are you sure that your test app runs with prio 1? Is this in the range of the SCHED_FIFO (userspace) priorities shown by chrt -m? ... SCHED_FIFO min/max priority : 1/99 ... 
If I run your setup (test and cyclictest affine to CPU0) on 4.15.0-43 (i7-4750HQ) with: (1) test prio > cyclictest prio # chrt -p $PID_TEST pid 28489's current scheduling policy: SCHED_FIFO pid 28489's current scheduling priority: 51 # cat /proc/$PID_TEST/stat 28489 (test) R 28488 28487 8664 34828 28487 4194304 86 0 0 0 0 0 0 0 -52 I get your behaviour: # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 6 0: 1: 5 0: 2: 2 0: 3:5419 0: 4: 3 0: 5: 2 0: 6: 2 0: 7: 2 0: 8:5422 0: 9: 3 ... whereas with: (2) test prio < cyclictest prio # chrt -p $PID_TEST pid 28811's current scheduling policy: SCHED_FIFO pid 28811's current scheduling priority: 49 # cat /proc/$PID_TEST/stat 28811 (test) S 28810 28809 8664 34828 28809 1077936128 90 0 0 0 0 0 0 0 -50 I get: # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 7 0: 1: 4 0: 2: 3 0: 3: 5 0: 4: 4 0: 5: 2 0: 6: 2 0: 7: 2 0: 8: 2 0: 9: 3 ... [...] > In this case cyclictest detects very long latencies >cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v > > Max CPUs = 2 > # /dev/cpu_dma_latency set to 0us > Thread 0 Interval: 1500 > 0: 0: 13 > 0: 1: 8 > 0: 2: 7 > 0: 3:5648 > 0: 4: 8 > 0: 5: 7 > 0: 6: 7 > 0: 7: 7 > 0: 8:5649 [...] > After verification, although no other process is running with real > time scheduler, I see a latency of about 5.6 ms at regular intervals. > > it seems that the priority task 1 (fifo) is not pre-empted by the > cyclictest process with a priority of 50 (fifo) when the low priority task is > active. > This corresponds to the cycle recorded in the file: 6 ms of latency > followed by 4 "normal" latencies of 7 us. > > Does anyone have any idea of this problem? > > Best regards > Frederic MATHIEU
Re: Kernel 4.9: strange behavior with fifo scheduler
Hi Frédéric, On 2/5/19 11:47 AM, Frédéric Mathieu wrote: Hi, on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), I use the linux kernel 4.9.146 with patch rt 125. uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 CET 2019 x86_64 GNU/Linux I observed a strange behavior of the scheduler when several tasks are executed in FIFO mode on a CPU core and a significant CPU activity. first test (reference: cpu load=0%): cyclictest -m -D 5 -i 1000 -p 50 -a 0 # / dev / cpu_dma_latency set to 0us policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305 T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: 18 work fine now, i'm loading the system on the cpu core 0 with a homemade process: cpu load 60%, sched FIFO prio 1, cpu 0 Are you sure that your test app runs with prio 1? Is this in the range of the SCHED_FIFO (userspace) priorities shown by chrt -m? ... SCHED_FIFO min/max priority : 1/99 ... If I run your setup (test and cyclictest affine to CPU0) on 4.15.0-43 (i7-4750HQ) with: (1) test prio > cyclictest prio # chrt -p $PID_TEST pid 28489's current scheduling policy: SCHED_FIFO pid 28489's current scheduling priority: 51 # cat /proc/$PID_TEST/stat 28489 (test) R 28488 28487 8664 34828 28487 4194304 86 0 0 0 0 0 0 0 -52 I get your behaviour: # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 6 0: 1: 5 0: 2: 2 0: 3:5419 0: 4: 3 0: 5: 2 0: 6: 2 0: 7: 2 0: 8:5422 0: 9: 3 ... whereas with: (2) test prio < cyclictest prio # chrt -p $PID_TEST pid 28811's current scheduling policy: SCHED_FIFO pid 28811's current scheduling priority: 49 # cat /proc/$PID_TEST/stat 28811 (test) S 28810 28809 8664 34828 28809 1077936128 90 0 0 0 0 0 0 0 -50 I get: # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 7 0: 1: 4 0: 2: 3 0: 3: 5 0: 4: 4 0: 5: 2 0: 6: 2 0: 7: 2 0: 8: 2 0: 9: 3 ... [...] 
In this case cyclictest detects very long latencies cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v Max CPUs = 2 # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 13 0: 1: 8 0: 2: 7 0: 3:5648 0: 4: 8 0: 5: 7 0: 6: 7 0: 7: 7 0: 8:5649 [...] After verification, although no other process is running with real time scheduler, I see a latency of about 5.6 ms at regular intervals. it seems that the priority task 1 (fifo) is not pre-empted by the cyclictest process with a priority of 50 (fifo) when the low priority task is active. This corresponds to the cycle recorded in the file: 6 ms of latency followed by 4 "normal" latencies of 7 us. Does anyone have any idea of this problem? Best regards Frederic MATHIEU
Kernel 4.9: strange behavior with fifo scheduler
Hi, on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), I use the linux kernel 4.9.146 with patch rt 125. uname a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 CET 2019 x86_64 GNU/Linux I observed a strange behavior of the scheduler when several tasks are executed in FIFO mode on a CPU core and a significant CPU activity. first test (reference: cpu load=0%): cyclictest -m -D 5 -i 1000 -p 50 -a 0 # / dev / cpu_dma_latency set to 0us policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305 T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: 18 work fine now, i'm loading the system on the cpu core 0 with a homemade process: cpu load 60%, sched FIFO prio 1, cpu 0 code snippet active_time = 6000; sleep_time = 4000; do { clock_gettime(CLOCK_MONOTONIC, &start_time); // active loop do { for( cptdelay=0; cptdelay < 100; cptdelay++ ); clock_gettime(CLOCK_MONOTONIC, &now); diff = timespec_diff_us( &start_time, &now ); } while (diff < active_time); // sleep : suspend time usleep(sleep_time); } while( 1 ); In this case cyclictest detects very long latencies cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v Max CPUs = 2 # /dev/cpu_dma_latency set to 0us Thread 0 Interval: 1500 0: 0: 13 0: 1: 8 0: 2: 7 0: 3:5648 0: 4: 8 0: 5: 7 0: 6: 7 0: 7: 7 0: 8:5649 0: 9: 7 0: 10: 7 0: 11: 7 0: 12: 7 0: 13:5651 0: 14: 7 0: 15: 7 0: 16: 7 0: 17: 7 0: 18:5655 0: 19: 7 0: 20: 7 0: 21: 7 0: 22: 7 0: 23:5658 0: 24: 7 0: 25: 7 0: 26: 7 0: 27: 7 0: 28:5663 0: 29: 7 0: 30: 7 0: 31: 7 0: 32: 7 0: 33:5664 0: 34: 7 0: 35: 7 0: 36: 7 0: 37: 7 0: 38:5667 0: 39: 7 0: 40: 7 0: 41: 7 0: 42: 7 0: 43:5671 0: 44: 8 0: 45: 7 0: 46: 7 0: 47: 7 0: 48:5673 0: 49: 7 0: 50: 7 0: 51: 7 0: 52: 7 0: 53:5676 0: 54: 7 0: 55: 7 0: 56: 7 0: 57: 7 0: 58:5679 0: 59: 7 0: 60: 7 0: 61: 7 0: 62: 7 0: 63:5682 0: 64: 7 0: 65: 7 0: 66: 7 0: 67: 7 0: 68:5685 After verification, although no other process is running with real time scheduler, I see a latency of about 5.6 ms at regular intervals. 
it seems that the priority task 1 (fifo) is not pre-empted by the cyclictest process with a priority of 50 (fifo) when the low priority task is active. This corresponds to the cycle recorded in the file: 6 ms of latency followed by 4 "normal" latencies of 7 us. Does anyone have any idea of this problem? Best regards Frederic MATHIEU
Re: Strange hang with gcc 8 of kprobe multiple_kprobes test
* Steven Rostedt wrote: > On Tue, 4 Dec 2018 09:15:06 +0100 > Ingo Molnar wrote: > > > * Masami Hiramatsu wrote: > > > > > I remember I have fixed this, and actually WE did it :-D > > > > > > https://lkml.org/lkml/2018/8/23/1203 > > > > > > Ah, we hit a same bug... > > > > > > Ingo, could you pick the patch? Should I resend it? > > > > Indeed: I just picked it up into tip:perf/urgent. > > > > It's my bad: I missed the original submission due to Steve's feedback > > which I mistook as a request for another iteration, while he only > > commented on the reason for the original breakage and gave his > > Reviewed-by ... > > > > Sorry for the confusion. The patch and code was quite complex, and I > was documenting what I thought of the patch (and the bug), so that my > Reviewed-by had a bit more meaning. It was totally welcome when I read it the second time - and it's my bad I was reading too quickly first time around ;-) So please keep doing it whenever you find the time! :) Thanks, Ingo
Re: Strange hang with gcc 8 of kprobe multiple_kprobes test
On Tue, 4 Dec 2018 09:15:06 +0100 Ingo Molnar wrote: > * Masami Hiramatsu wrote: > > > I remember I have fixed this, and actually WE did it :-D > > > > https://lkml.org/lkml/2018/8/23/1203 > > > > Ah, we hit a same bug... > > > > Ingo, could you pick the patch? Should I resend it? > > Indeed: I just picked it up into tip:perf/urgent. > > It's my bad: I missed the original submission due to Steve's feedback > which I mistook as a request for another iteration, while he only > commented on the reason for the original breakage and gave his > Reviewed-by ... > Sorry for the confusion. The patch and code was quite complex, and I was documenting what I thought of the patch (and the bug), so that my Reviewed-by had a bit more meaning. -- Steve
Re: Strange hang with gcc 8 of kprobe multiple_kprobes test
* Masami Hiramatsu wrote: > I remember I have fixed this, and actually WE did it :-D > > https://lkml.org/lkml/2018/8/23/1203 > > Ah, we hit a same bug... > > Ingo, could you pick the patch? Should I resend it? Indeed: I just picked it up into tip:perf/urgent. It's my bad: I missed the original submission due to Steve's feedback which I mistook as a request for another iteration, while he only commented on the reason for the original breakage and gave his Reviewed-by ... Thanks, Ingo
Re: Strange hang with gcc 8 of kprobe multiple_kprobes test
On Tue, 4 Dec 2018 16:40:07 +0900 Masami Hiramatsu wrote: > Hi Steve, > > On Mon, 3 Dec 2018 21:18:07 -0500 > Steven Rostedt wrote: > > > Hi Masami, > > > > I started testing some of my new code and the system got into a > > strange state. Debugging further, I found the cause came from the > > kprobe tests. It became stranger to me that I could reproduce it with > > older kernels. I went back as far as 4.16 and it triggered. I thought > > this very strange because I ran this test on all those kernels in the > > past. > > > > After a bit of hair pulling, I figured out what changed. I upgraded to > > gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7 > > and the tests pass without issue. > > OK, let me see. > > > The issue that I notice when the system gets into this strange state is > > that I can't log into the box. Nor can I reboot. Basically it's > > anything to do with systemd just doesn't work (insert your jokes here > > now, and then let's move on). > > > > I was able to narrow down what the exact function was that caused the > > issues and it is: update_vsyscall() > > > > gcc 7 looks like this: > > > > 81004bf0 : > > 81004bf0: e8 0b cc 9f 00 callq 81a01800 > > <__fentry__> > > 81004bf1: R_X86_64_PC32 __fentry__-0x4 > > 81004bf5: 48 8b 07mov(%rdi),%rax > > 81004bf8: 8b 15 96 5f 34 01 mov0x1345f96(%rip),%edx > ># 8234ab94 > > 81004bfa: R_X86_64_PC32 vclocks_used-0x4 > > 81004bfe: 83 05 7b 84 6f 01 01addl $0x1,0x16f847b(%rip) > ># 826fd080 > > 81004c00: R_X86_64_PC32 > > vsyscall_gtod_data-0x5 > > 81004c05: 8b 48 24mov0x24(%rax),%ecx > > 81004c08: b8 01 00 00 00 mov$0x1,%eax > > 81004c0d: d3 e0 shl%cl,%eax > > > > And gcc 8 looks like this: > > > > 81004c90 : > > 81004c90: e8 6b cb 9f 00 callq 81a01800 > > <__fentry__> > > 81004c91: R_X86_64_PC32 __fentry__-0x4 > > 81004c95: 48 8b 07mov(%rdi),%rax > > 81004c98: 83 05 e1 93 6f 01 01addl $0x1,0x16f93e1(%rip) > ># 826fe080 > > Hm this is a RIP relative instruction, it should be modified by 
kprobes. > > > 81004c9a: R_X86_64_PC32 > > vsyscall_gtod_data-0x5 > > 81004c9f: 8b 50 24mov0x24(%rax),%edx > > 81004ca2: 8b 05 ec 5e 34 01 mov0x1345eec(%rip),%eax > ># 8234ab94 > > 81004ca4: R_X86_64_PC32 vclocks_used-0x4 > > > > The test adds a kprobe (optimized) at udpate_vsyscall+5. And will > > insert a jump on the two instructions after fentry. The difference > > between v7 and v8 is that v7 is touching vclocks_used and v8 is > > touching vsyscall_gtod_data. > > > > Is there some black magic going on with the vsyscall area with > > vsyscall_gtod_data that is causing havoc when a kprobe is added there? > > I think it might miss something when preprocessing RIP relative instruction. Ah, I got it. I thought it had been fixed, but not... In arch/x86/kernel/kprobes/opt.c, copy_optimized_instructions() does a copy loop, but only update src and dest cursors, but not update real address which is used for adjusting RIP relative instruction. while (len < RELATIVEJUMP_SIZE) { ret = __copy_instruction(dest + len, src + len, real, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } I remember I have fixed this, and actually WE did it :-D https://lkml.org/lkml/2018/8/23/1203 Ah, we hit a same bug... Ingo, could you pick the patch? Should I resend it? Thank you, -- Masami Hiramatsu
Re: Strange hang with gcc 8 of kprobe multiple_kprobes test
Hi Steve, On Mon, 3 Dec 2018 21:18:07 -0500 Steven Rostedt wrote: > Hi Masami, > > I started testing some of my new code and the system got into a > strange state. Debugging further, I found the cause came from the > kprobe tests. It became stranger to me that I could reproduce it with > older kernels. I went back as far as 4.16 and it triggered. I thought > this very strange because I ran this test on all those kernels in the > past. > > After a bit of hair pulling, I figured out what changed. I upgraded to > gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7 > and the tests pass without issue. OK, let me see. > The issue that I notice when the system gets into this strange state is > that I can't log into the box. Nor can I reboot. Basically it's > anything to do with systemd just doesn't work (insert your jokes here > now, and then let's move on). > > I was able to narrow down what the exact function was that caused the > issues and it is: update_vsyscall() > > gcc 7 looks like this: > > 81004bf0 : > 81004bf0: e8 0b cc 9f 00 callq 81a01800 > <__fentry__> > 81004bf1: R_X86_64_PC32 __fentry__-0x4 > 81004bf5: 48 8b 07mov(%rdi),%rax > 81004bf8: 8b 15 96 5f 34 01 mov0x1345f96(%rip),%edx > # 8234ab94 > 81004bfa: R_X86_64_PC32 vclocks_used-0x4 > 81004bfe: 83 05 7b 84 6f 01 01addl $0x1,0x16f847b(%rip) > # 826fd080 > 81004c00: R_X86_64_PC32 vsyscall_gtod_data-0x5 > 81004c05: 8b 48 24mov0x24(%rax),%ecx > 81004c08: b8 01 00 00 00 mov$0x1,%eax > 81004c0d: d3 e0 shl%cl,%eax > > And gcc 8 looks like this: > > 81004c90 : > 81004c90: e8 6b cb 9f 00 callq 81a01800 > <__fentry__> > 81004c91: R_X86_64_PC32 __fentry__-0x4 > 81004c95: 48 8b 07mov(%rdi),%rax > 81004c98: 83 05 e1 93 6f 01 01addl $0x1,0x16f93e1(%rip) > # 826fe080 Hm this is a RIP relative instruction, it should be modified by kprobes. 
> 81004c9a: R_X86_64_PC32 vsyscall_gtod_data-0x5 > 81004c9f: 8b 50 24mov0x24(%rax),%edx > 81004ca2: 8b 05 ec 5e 34 01 mov0x1345eec(%rip),%eax > # 8234ab94 > 81004ca4: R_X86_64_PC32 vclocks_used-0x4 > > The test adds a kprobe (optimized) at udpate_vsyscall+5. And will > insert a jump on the two instructions after fentry. The difference > between v7 and v8 is that v7 is touching vclocks_used and v8 is > touching vsyscall_gtod_data. > > Is there some black magic going on with the vsyscall area with > vsyscall_gtod_data that is causing havoc when a kprobe is added there? I think it might miss something when preprocessing RIP relative instruction. Could you disable jump optimization as below and test what happen on update_vsyscall+5 AND update_vsyscall+8? (RIP relative preprocess must happen even if the jump optimization is disabled) # echo 0 > /proc/sys/debug/kprobes-optimization > I can dig a little more into this, but I'm currently at my HQ office > with a lot of other objectives that I must get done, and I can't work > on this much more this week. OK, let me try to reproduce it in my environment. > > I included my config (for my virt machine, which I was also able to > trigger it with). Thanks, but I think it should not depend on the kconfig. > > The test that triggers this bug is: > > tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc > > It runs the test fine, but other things just start to act up after I > run it. Yeah, thank you for digging it down. It is now much easier to me. > > I notice that when I get into the state, journald and the dbus_daemon > are constantly running. Perhaps the userspace time keeping went bad? Yeah, I think so. Maybe addl instruction becomes broken. Thank you, -- Masami Hiramatsu
Re: CHECKPATCH: strange warning on alignment modifier
On Mon, 2018-10-08 at 10:56 +0300, Igor Stoppa wrote: > Hi, > > I have the following fragment of code: > > +struct my_struct { > + atomic_long_t l __aligned(sizeof(atomic_long_t)); > +} __aligned(sizeof(atomic_long_t)); > > > triggering this warning, when fed to checkpatch.pl: > > WARNING: function definition argument 'atomic_long_t' should also have > an identifier name > #19: FILE: path/to/file.h > + atomic_long_t l __aligned(sizeof(atomic_long_t)); > > > gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it > > I am using the HEAD from mainline. > > My intent is to specify the alignment of both the field and the > structure (yes, probably redundant in this single-field case). > > If I am doing something wrong, I can't figure out what it is, but I > don't understand why the WARNING is mentioning a function definition. It's a defect in checkpatch. For now, just ignore the message. I will work on it later.
CHECKPATCH: strange warning on alignment modifier
Hi, I have the following fragment of code: +struct my_struct { + atomic_long_t l __aligned(sizeof(atomic_long_t)); +} __aligned(sizeof(atomic_long_t)); triggering this warning, when fed to checkpatch.pl: WARNING: function definition argument 'atomic_long_t' should also have an identifier name #19: FILE: path/to/file.h + atomic_long_t l __aligned(sizeof(atomic_long_t)); gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it I am using the HEAD from mainline. My intent is to specify the alignment of both the field and the structure (yes, probably redundant in this single-field case). If I am doing something wrong, I can't figure out what it is, but I don't understand why the WARNING is mentioning a function definition. -- igor
[PATCH 4.14 053/156] s390/zcrypt: Fix wrong comparison leading to strange load balancing
4.14-stable review patch. If anyone has any objections, please let me know. -- From: Harald Freudenberger [ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ] The function to decide if one zcrypt queue is better than another one compared two pointers instead of comparing the values where the pointers refer to. So within the same zcrypt card when load of each queue was equal just one queue was used. This effect only appears on relatively lite load, typically with one thread applications. This patch fixes the wrong comparison and now the counters show that requests are balanced equally over all available queues within the cards. There is no performance improvement coming with this fix. As long as the queue depth for an APQN queue is not touched, processing is not faster when requests are spread over queues within the same card hardware. So this fix only beautifies the lszcrypt counter printouts. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/crypto/zcrypt_api.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare( weight += atomic_read(&zq->load); pref_weight += atomic_read(&pref_zq->load); if (weight == pref_weight) - return &zq->queue->total_request_count > - &pref_zq->queue->total_request_count; + return zq->queue->total_request_count > + pref_zq->queue->total_request_count; return weight > pref_weight; }
[PATCH AUTOSEL for 4.14 024/100] s390/zcrypt: Fix wrong comparison leading to strange load balancing
From: Harald Freudenberger [ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ] The function to decide if one zcrypt queue is better than another one compared two pointers instead of comparing the values where the pointers refer to. So within the same zcrypt card when load of each queue was equal just one queue was used. This effect only appears on relatively lite load, typically with one thread applications. This patch fixes the wrong comparison and now the counters show that requests are balanced equally over all available queues within the cards. There is no performance improvement coming with this fix. As long as the queue depth for an APQN queue is not touched, processing is not faster when requests are spread over queues within the same card hardware. So this fix only beautifies the lszcrypt counter printouts. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin --- drivers/s390/crypto/zcrypt_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index b5f4006198b9..a9a56aa9c26b 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(struct zcrypt_queue *zq, weight += atomic_read(&zq->load); pref_weight += atomic_read(&pref_zq->load); if (weight == pref_weight) - return &zq->queue->total_request_count > - &pref_zq->queue->total_request_count; + return zq->queue->total_request_count > + pref_zq->queue->total_request_count; return weight > pref_weight; } -- 2.11.0
[PATCH 02/22] signal: Document the strange si_codes used by ptrace event stops
Signed-off-by: "Eric W. Biederman" --- include/uapi/asm-generic/siginfo.h | 5 + 1 file changed, 5 insertions(+) diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index cbe1b0cc7a6a..7158421ac911 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -225,6 +225,11 @@ typedef struct siginfo { #define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */ #define NSIGTRAP 4 +/* + * There are an additional set of SIGTRAP si_codes used by ptrace + * that of the form: ((PTRACE_EVENT_XXX << 8) | SIGTRAP) + */ + /* * SIGCHLD si_codes */ -- 2.14.1
Re: linux-next: strange update of the sound tree
Hi Takashi, On Mon, 05 Jun 2017 18:32:54 +0200 Takashi Iwai wrote: > > Doh, I pushed out a wrong branch (master) to for-next, as I had to > work on another machine at home today due to a holiday. Now the > correct branch was pushed. Much better, thanks :-) -- Cheers, Stephen Rothwell
Re: linux-next: strange update of the sound tree
On Mon, 05 Jun 2017 16:17:23 +0200, Stephen Rothwell wrote: > > Hi Takashi, > > I don't think you have the result in the sound tree > (git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git > branch for-next). Yesterday it was a fairly linear tree with head > commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some > extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix > Amanero Combo384 quirk on big-endian hosts"). Now it is a tree with > 687 commits (realative to Linus' tree) with 651 merge commits in it. :-( Doh, I pushed out a wrong branch (master) to for-next, as I had to work on another machine at home today due to a holiday. Now the correct branch was pushed. Sorry for inconvenience, and thanks for catching it. Takashi
linux-next: strange update of the sound tree
Hi Takashi, I don't think you have the result in the sound tree (git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git branch for-next). Yesterday it was a fairly linear tree with head commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix Amanero Combo384 quirk on big-endian hosts"). Now it is a tree with 687 commits (relative to Linus' tree) with 651 merge commits in it. :-( -- Cheers, Stephen Rothwell
Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer
On 6/2/2017 9:43 AM, Michal Hocko wrote: On Fri 02-06-17 09:20:54, Tom Lendacky wrote: On 5/31/2017 11:04 AM, Michal Hocko wrote: Hi Tom, Hi Michal, I have stumbled over the following construct in xgbe_map_rx_buffer order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1? And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all? The driver tries to allocate a number of pages to be used as receive buffers. Based on what I could find in documentation, the value of PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations (could) get expensive. So I decrease by one the order requested. The max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever gets defined as 0, 0 would be used. So you have fallen into a carefully prepared trap ;). The thing is that orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can completely see how this can be confusing. Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do any direct reclaim/compaction attempts so the costly vs. non-costly doesn't apply here at all. I would be much happier if no code outside of mm used PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper consideration. E.g. what would be the largest size that would be useful for this path? xgbe_alloc_pages does the order fallback so PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me. I guess we can at least simplify the xgbe right away though. --- From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Jun 2017 16:34:28 +0200 Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case actually because orders larger than that are costly. 
And even that applies only to sleeping allocations which is not the case here. We simply do not perform any costly operations like reclaim or compaction for those. Simplify the code by dropping the order calculation and use PAGE_ALLOC_COSTLY_ORDER directly. Signed-off-by: Michal Hocko --- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index b3bc87fe3764..5ded10eba418 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata, } if (!ring->rx_buf_pa.pages) { - order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); ret = xgbe_alloc_pages(pdata, &ring->rx_buf_pa, GFP_ATOMIC, - order); + PAGE_ALLOC_COSTLY_ORDER); You'll need to also remove the variable definition to avoid an un-used variable warning. You should also send this to the netdev mailing list to send this through the net-next tree (or net tree if you want it fixed in the current version of the Linux kernel). Thanks, Tom if (ret) return ret; }
Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer
On Fri 02-06-17 09:20:54, Tom Lendacky wrote: > On 5/31/2017 11:04 AM, Michal Hocko wrote: > >Hi Tom, > > Hi Michal, > > >I have stumbled over the following construct in xgbe_map_rx_buffer > > order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); > >which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1? > >And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all? > > > > The driver tries to allocate a number of pages to be used as receive > buffers. Based on what I could find in documentation, the value of > PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations > (could) get expensive. So I decrease by one the order requested. The > max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever > gets defined as 0, 0 would be used. So you have fallen into a carefully prepared trap ;). The thing is that orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can completely see how this can be confusing. Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do any direct reclaim/compaction attempts so the costly vs. non-costly doesn't apply here at all. I would be much happier if no code outside of mm used PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper consideration. E.g. what would be the largest size that would be useful for this path? xgbe_alloc_pages does the order fallback so PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me. I guess we can at least simplify the xgbe right away though. --- >From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Jun 2017 16:34:28 +0200 Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case actually because orders larger than that are costly. 
And even that applies only to sleeping allocations which is not the case here. We simply do not perform any costly operations like reclaim or compaction for those. Simplify the code by dropping the order calculation and use PAGE_ALLOC_COSTLY_ORDER directly. Signed-off-by: Michal Hocko --- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index b3bc87fe3764..5ded10eba418 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata, } if (!ring->rx_buf_pa.pages) { - order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); ret = xgbe_alloc_pages(pdata, &ring->rx_buf_pa, GFP_ATOMIC, - order); + PAGE_ALLOC_COSTLY_ORDER); if (ret) return ret; } -- 2.11.0 -- Michal Hocko SUSE Labs
Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer
On 5/31/2017 11:04 AM, Michal Hocko wrote: Hi Tom, Hi Michal, I have stumbled over the following construct in xgbe_map_rx_buffer order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1? And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all? The driver tries to allocate a number of pages to be used as receive buffers. Based on what I could find in documentation, the value of PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations (could) get expensive. So I decrease by one the order requested. The max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever gets defined as 0, 0 would be used. I believe there have been some enhancements relative to speed in allocating 0-order pages recently that may make this unnecessary. I haven't run any performance tests yet to determine if I can just go to a 0-order allocation, though. Thanks, Tom Thanks!
strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer
Hi Tom, I have stumbled over the following construct in xgbe_map_rx_buffer order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0); which looks quite suspicious. Why does it use PAGE_ALLOC_COSTLY_ORDER - 1? And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all? Thanks! -- Michal Hocko SUSE Labs
Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.
On Wed, May 03 2017, Mikulas Patocka wrote: > On Mon, 24 Apr 2017, NeilBrown wrote: >> >> I had a look at how the allocation 'dm_region' objects are used, >> and it would take a bit of work to make it really safe. >> My guess is __rh_find() should be allowed to fail, and the various >> callers need to handle failure. >> For example, dm_rh_inc_pending() would be given a second bio_list, >> and would move any bios for which rh_inc() fails, onto that list. >> Then do_writes() would merge that list back into ms->writes. >> That way do_mirror() would not block indefinitely and forward progress >> could be assured ... maybe. >> It would take more work than I'm able to give at the moment, so >> I'm happy to just drop this patch. >> >> Thanks, >> NeilBrown > > I think that the only way how to fix this would be to preallocate the all > the regions when the target is created. > > But, with the default region size 512kiB, it would cause high memory > consumption (approximatelly 1GB of RAM for 20TB device). Two reflections: 1/ This is close to what md/bitmap does. It actually uses a 2-level array for the 'pending' field from dm_region, combined with something similar to 'state'. The top level is allocated when the device is created. Entries in this table are either - pointers to a second-level array for 2048 regions - entries for 2 giant regions, 1024 times the normal size. So if we cannot allocate a page when we need that second level, we just use an enormous region and so risk resync taking a bit longer if there is a crash. 2/ Even though md does pre-alloc to a degree, I'm not convinced that it is necessary. We only need a region to be recorded when it is actively being written to, or when it is being recovered. We could, in theory, have just one region that is written to and one region that is being recovered. If a writes request arrives for a different region it blocks until the current region has no active requests. 
Then that region is forgotten and the new region activated, and the new write allowed to proceed. Obviously this would be horribly slow, but it should be deadlock-free. Using a mempool instead of a single region would then allow multiple regions to be active in parallel, which would improve throughput without affecting the deadlock status. Maybe I'll try to code it and see what happens ... maybe not. NeilBrown signature.asc Description: PGP signature
Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.
On Mon, 24 Apr 2017, NeilBrown wrote: > On Fri, Apr 21 2017, Mikulas Patocka wrote: > > > On Mon, 10 Apr 2017, NeilBrown wrote: > > > >> mempool_alloc() should only be called with GFP_ATOMIC when > >> it is not safe to wait. Passing __GFP_NOFAIL to kmalloc() > >> says that it is safe to wait indefinitely. So this code is > >> inconsistent. > >> > >> Clearly it is OK to wait indefinitely in this code, and > >> mempool_alloc() is able to do that. So just use > >> mempool_alloc, and allow it to sleep. If no memory is > >> available it will wait for something to be returned to the > >> pool, and will retry a normal allocation regularly. > > > > The region hash code is buggy anyway, because it may allocate more entries > > than the size of the pool and not give them back. > > > > That kmalloc was introduced in the commit c06aad854 to work around a > > deadlock due to incorrect mempool usage. > > > > Your patch slightly increases the probability of the deadlock because > > mempool_alloc does all allocations with __GFP_NOMEMALLOC, so it uses > > higher limit than kmalloc(GFP_NOIO). > > > > Thanks for the review. > > I had a look at how the allocation 'dm_region' objects are used, > and it would take a bit of work to make it really safe. > My guess is __rh_find() should be allowed to fail, and the various > callers need to handle failure. > For example, dm_rh_inc_pending() would be given a second bio_list, > and would move any bios for which rh_inc() fails, onto that list. > Then do_writes() would merge that list back into ms->writes. > That way do_mirror() would not block indefinitely and forward progress > could be assured ... maybe. > It would take more work than I'm able to give at the moment, so > I'm happy to just drop this patch. > > Thanks, > NeilBrown I think that the only way how to fix this would be to preallocate the all the regions when the target is created. 
But, with the default region size 512kiB, it would cause high memory consumption (approximately 1GB of RAM for a 20TB device). Mikulas
Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.
On Fri, Apr 21 2017, Mikulas Patocka wrote: > On Mon, 10 Apr 2017, NeilBrown wrote: > >> mempool_alloc() should only be called with GFP_ATOMIC when >> it is not safe to wait. Passing __GFP_NOFAIL to kmalloc() >> says that it is safe to wait indefinitely. So this code is >> inconsistent. >> >> Clearly it is OK to wait indefinitely in this code, and >> mempool_alloc() is able to do that. So just use >> mempool_alloc, and allow it to sleep. If no memory is >> available it will wait for something to be returned to the >> pool, and will retry a normal allocation regularly. > > The region hash code is buggy anyway, because it may allocate more entries > than the size of the pool and not give them back. > > That kmalloc was introduced in the commit c06aad854 to work around a > deadlock due to incorrect mempool usage. > > Your patch slightly increases the probability of the deadlock because > mempool_alloc does all allocations with __GFP_NOMEMALLOC, so it uses > higher limit than kmalloc(GFP_NOIO). > Thanks for the review. I had a look at how the allocation 'dm_region' objects are used, and it would take a bit of work to make it really safe. My guess is __rh_find() should be allowed to fail, and the various callers need to handle failure. For example, dm_rh_inc_pending() would be given a second bio_list, and would move any bios for which rh_inc() fails, onto that list. Then do_writes() would merge that list back into ms->writes. That way do_mirror() would not block indefinitely and forward progress could be assured ... maybe. It would take more work than I'm able to give at the moment, so I'm happy to just drop this patch. Thanks, NeilBrown signature.asc Description: PGP signature
Re: [PATCH] dm-region-hash: fix strange usage of mempool_alloc.
On Mon, 10 Apr 2017, NeilBrown wrote: > mempool_alloc() should only be called with GFP_ATOMIC when > it is not safe to wait. Passing __GFP_NOFAIL to kmalloc() > says that it is safe to wait indefinitely. So this code is > inconsistent. > > Clearly it is OK to wait indefinitely in this code, and > mempool_alloc() is able to do that. So just use > mempool_alloc, and allow it to sleep. If no memory is > available it will wait for something to be returned to the > pool, and will retry a normal allocation regularly. The region hash code is buggy anyway, because it may allocate more entries than the size of the pool and not give them back. That kmalloc was introduced in the commit c06aad854 to work around a deadlock due to incorrect mempool usage. Your patch slightly increases the probability of the deadlock because mempool_alloc does all allocations with __GFP_NOMEMALLOC, so it uses higher limit than kmalloc(GFP_NOIO). Mikulas > Signed-off-by: NeilBrown > --- > drivers/md/dm-region-hash.c | 4 +--- > 1 file changed, 1 insertion(+), 3 deletions(-) > > diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c > index 85c32b22a420..a6279f5d779e 100644 > --- a/drivers/md/dm-region-hash.c > +++ b/drivers/md/dm-region-hash.c > @@ -287,9 +287,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash > *rh, region_t region) > { > struct dm_region *reg, *nreg; > > - nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); > - if (unlikely(!nreg)) > - nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); > + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); > > nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? > DM_RH_CLEAN : DM_RH_NOSYNC; > -- > 2.12.2 > >
[PATCH] dm-region-hash: fix strange usage of mempool_alloc.
mempool_alloc() should only be called with GFP_ATOMIC when it is not safe to wait. Passing __GFP_NOFAIL to kmalloc() says that it is safe to wait indefinitely. So this code is inconsistent. Clearly it is OK to wait indefinitely in this code, and mempool_alloc() is able to do that. So just use mempool_alloc, and allow it to sleep. If no memory is available it will wait for something to be returned to the pool, and will retry a normal allocation regularly. Signed-off-by: NeilBrown --- drivers/md/dm-region-hash.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 85c32b22a420..a6279f5d779e 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -287,9 +287,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) { struct dm_region *reg, *nreg; - nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); - if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; -- 2.12.2 signature.asc Description: PGP signature
[PATCH 3/9] rhashtable: simplify a strange allocation pattern
From: Michal Hocko alloc_bucket_locks allocation pattern is quite unusual. We are preferring vmalloc when CONFIG_NUMA is enabled. The rationale is that vmalloc will respect the memory policy of the current process and so the backing memory will get distributed over multiple nodes if the requester is configured properly. At least that is the intention, in reality rhastable is shrunk and expanded from a kernel worker so no mempolicy can be assumed. Let's just simplify the code and use kvmalloc helper, which is a transparent way to use kmalloc with vmalloc fallback, if the caller is allowed to block and use the flag otherwise. Cc: Tom Herbert Cc: Eric Dumazet Acked-by: Vlastimil Babka Signed-off-by: Michal Hocko --- lib/rhashtable.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f8635fd57442..2c2c8afcde15 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -86,16 +86,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min(size, 1U << tbl->nest); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks) -- 2.11.0
[PATCH 4/9] ila: simplify a strange allocation pattern
From: Michal Hocko alloc_ila_locks seemed to c&p from alloc_bucket_locks allocation pattern which is quite unusual. The default allocation size is 320 * sizeof(spinlock_t) which is sub page unless lockdep is enabled when the performance benefit is really questionable and not worth the subtle code IMHO. Also note that the context when we call ila_init_net (modprobe or a task creating a net namespace) has to be properly configured. Let's just simplify the code and use kvmalloc helper which is a transparent way to use kmalloc with vmalloc fallback. Cc: Tom Herbert Cc: Eric Dumazet Acked-by: Vlastimil Babka Signed-off-by: Michal Hocko --- net/ipv6/ila/ila_xlat.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) -- 2.11.0
[media] s5p-cec: strange clk enabling pattern
The s5p-cec driver has a few places that touch hdmicec clk: static int s5p_cec_probe(struct platform_device *pdev) { ... cec->clk = devm_clk_get(dev, "hdmicec"); if (IS_ERR(cec->clk)) return PTR_ERR(cec->clk); ... } static int __maybe_unused s5p_cec_runtime_suspend(struct device *dev) { struct s5p_cec_dev *cec = dev_get_drvdata(dev); clk_disable_unprepare(cec->clk); return 0; } static int __maybe_unused s5p_cec_runtime_resume(struct device *dev) { struct s5p_cec_dev *cec = dev_get_drvdata(dev); int ret; ret = clk_prepare_enable(cec->clk); if (ret < 0) return ret; return 0; } Is it ok to enable/disable clock in rusume/suspend only? Or have I missed anything? -- Thank you, Alexey Khoroshilov Linux Verification Center, ISPRAS web: http://linuxtesting.org
Re: [PATCH 4/9] ila: simplify a strange allocation pattern
On 01/30/2017 10:49 AM, Michal Hocko wrote: From: Michal Hocko alloc_ila_locks seemed to c&p from alloc_bucket_locks allocation pattern which is quite unusual. The default allocation size is 320 * sizeof(spinlock_t) which is sub page unless lockdep is enabled when the performance benefit is really questionable and not worth the subtle code IMHO. Also note that the context when we call ila_init_net (modprobe or a task creating a net namespace) has to be properly configured. Let's just simplify the code and use kvmalloc helper which is a transparent way to use kmalloc with vmalloc fallback. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka --- net/ipv6/ila/ila_xlat.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++)
Re: [PATCH 3/9] rhashtable: simplify a strange allocation pattern
On 01/30/2017 10:49 AM, Michal Hocko wrote: From: Michal Hocko alloc_bucket_locks allocation pattern is quite unusual. We are preferring vmalloc when CONFIG_NUMA is enabled. The rationale is that vmalloc will respect the memory policy of the current process and so the backing memory will get distributed over multiple nodes if the requester is configured properly. At least that is the intention, in reality rhashtable is shrunk and expanded from a kernel worker so no mempolicy can be assumed. Let's just simplify the code and use kvmalloc helper, which is a transparent way to use kmalloc with vmalloc fallback, if the caller is allowed to block and use the flag otherwise. Cc: Tom Herbert Cc: Eric Dumazet Acked-by: Vlastimil Babka Signed-off-by: Michal Hocko --- lib/rhashtable.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..1a487ea70829 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -77,16 +77,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks)
[PATCH 4/9] ila: simplify a strange allocation pattern
From: Michal Hocko alloc_ila_locks seemed to c&p from alloc_bucket_locks allocation pattern which is quite unusual. The default allocation size is 320 * sizeof(spinlock_t) which is sub page unless lockdep is enabled when the performance benefit is really questionable and not worth the subtle code IMHO. Also note that the context when we call ila_init_net (modprobe or a task creating a net namespace) has to be properly configured. Let's just simplify the code and use kvmalloc helper which is a transparent way to use kmalloc with vmalloc fallback. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Michal Hocko --- net/ipv6/ila/ila_xlat.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) -- 2.11.0
[PATCH 3/9] rhashtable: simplify a strange allocation pattern
From: Michal Hocko alloc_bucket_locks allocation pattern is quite unusual. We are preferring vmalloc when CONFIG_NUMA is enabled. The rationale is that vmalloc will respect the memory policy of the current process and so the backing memory will get distributed over multiple nodes if the requester is configured properly. At least that is the intention, in reality rhashtable is shrunk and expanded from a kernel worker so no mempolicy can be assumed. Let's just simplify the code and use kvmalloc helper, which is a transparent way to use kmalloc with vmalloc fallback, if the caller is allowed to block and use the flag otherwise. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Michal Hocko --- lib/rhashtable.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..1a487ea70829 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -77,16 +77,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks) -- 2.11.0
Re: edac/i7300_edac.c:307: strange macro ?
On Thu, Jan 26, 2017 at 06:57:44AM -0200, Mauro Carvalho Chehab wrote: > Seems OK to me. Do you want to put it on your tree, or do you > prefer if I put it on mine? I can carry it. > If you prefer to send via your tree: > > Reviewed-by: Mauro Carvalho Chehab Thanks. -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
Re: edac/i7300_edac.c:307: strange macro ?
Em Wed, 25 Jan 2017 16:37:28 +0100 Borislav Petkov escreveu: > On Wed, Jan 25, 2017 at 12:04:04PM +, David Binderman wrote: > > You'll have a very long wait to get a linux patch from me. > > > > I am happy for someone else to invent a patch. > > Ah ok, I thought you wanted to give it a try and would want me to help > you with it. :-) > > Anyway, here it is: > > --- > From: Borislav Petkov > Date: Wed, 25 Jan 2017 16:08:27 +0100 > Subject: [PATCH] EDAC, i7300: Test for the second channel properly > > REDMEMB[17] is the ECC_Locator bit, which, when set, identifies the > CS[3:2] as the symbols in error. And thus the second channel. > > The macro computing it was wrong so get rid of it (it was used at one > place only) and get rid of the conditional too. Generates better code > this way anyway. > > Signed-off-by: Borislav Petkov > Reported-by: David Binderman Seems OK to me. Do you want to put it on your tree, or do you prefer if I put it on mine? If you prefer to send via your tree: Reviewed-by: Mauro Carvalho Chehab > --- > drivers/edac/i7300_edac.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c > index 0a912bf6de00..e391f5a716be 100644 > --- a/drivers/edac/i7300_edac.c > +++ b/drivers/edac/i7300_edac.c > @@ -304,7 +304,6 @@ static const char *ferr_global_lo_name[] = { > #define REDMEMA 0xdc > > #define REDMEMB 0x7c > - #define IS_SECOND_CH(v)((v) * (1 << 17)) > > #define RECMEMA 0xe0 >#define RECMEMA_BANK(v)(((v) >> 12) & 7) > @@ -483,8 +482,9 @@ static void i7300_process_fbd_error(struct mem_ctl_info > *mci) > pci_read_config_dword(pvt->pci_dev_16_1_fsb_addr_map, >REDMEMB, &value); > channel = (branch << 1); > - if (IS_SECOND_CH(value)) > - channel++; > + > + /* Second channel ? */ > + channel += !!(value & BIT(17)); > > /* Clear the error bit */ > pci_write_config_dword(pvt->pci_dev_16_1_fsb_addr_map, -- Thanks, Mauro
Re: edac/i7300_edac.c:307: strange macro ?
On Wed, Jan 25, 2017 at 12:04:04PM +, David Binderman wrote: > You'll have a very long wait to get a linux patch from me. > > I am happy for someone else to invent a patch. Ah ok, I thought you wanted to give it a try and would want me to help you with it. :-) Anyway, here it is: --- From: Borislav Petkov Date: Wed, 25 Jan 2017 16:08:27 +0100 Subject: [PATCH] EDAC, i7300: Test for the second channel properly REDMEMB[17] is the ECC_Locator bit, which, when set, identifies the CS[3:2] as the symbols in error. And thus the second channel. The macro computing it was wrong so get rid of it (it was used at one place only) and get rid of the conditional too. Generates better code this way anyway. Signed-off-by: Borislav Petkov Reported-by: David Binderman --- drivers/edac/i7300_edac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 0a912bf6de00..e391f5a716be 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -304,7 +304,6 @@ static const char *ferr_global_lo_name[] = { #define REDMEMA0xdc #define REDMEMB0x7c - #define IS_SECOND_CH(v) ((v) * (1 << 17)) #define RECMEMA0xe0 #define RECMEMA_BANK(v) (((v) >> 12) & 7) @@ -483,8 +482,9 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci) pci_read_config_dword(pvt->pci_dev_16_1_fsb_addr_map, REDMEMB, &value); channel = (branch << 1); - if (IS_SECOND_CH(value)) - channel++; + + /* Second channel ? */ + channel += !!(value & BIT(17)); /* Clear the error bit */ pci_write_config_dword(pvt->pci_dev_16_1_fsb_addr_map, -- 2.11.0 -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
Re: edac/i7300_edac.c:307: strange macro ?
On Wed, Jan 11, 2017 at 11:04:22PM +, David Binderman wrote: > Thanks for the confirmation that my best guess looks good to you. So what now, is there a fix in the form of a patch going to appear on the horizon anytime soon? :-) -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
[PATCH 4/6] ila: simplify a strange allocation pattern
From: Michal Hocko alloc_ila_locks seemed to c&p from alloc_bucket_locks allocation pattern which is quite unusual. The default allocation size is 320 * sizeof(spinlock_t) which is sub page unless lockdep is enabled when the performance benefit is really questionable and not worth the subtle code IMHO. Also note that the context when we call ila_init_net (modprobe or a task creating a net namespace) has to be properly configured. Let's just simplify the code and use kvmalloc helper which is a transparent way to use kmalloc with vmalloc fallback. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Michal Hocko --- net/ipv6/ila/ila_xlat.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) -- 2.11.0
[PATCH 3/6] rhashtable: simplify a strange allocation pattern
From: Michal Hocko alloc_bucket_locks allocation pattern is quite unusual. We are preferring vmalloc when CONFIG_NUMA is enabled. The rationale is that vmalloc will respect the memory policy of the current process and so the backing memory will get distributed over multiple nodes if the requester is configured properly. At least that is the intention, in reality rhashtable is shrunk and expanded from a kernel worker so no mempolicy can be assumed. Let's just simplify the code and use kvmalloc helper, which is a transparent way to use kmalloc with vmalloc fallback, if the caller is allowed to block and use the flag otherwise. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Michal Hocko --- lib/rhashtable.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..1a487ea70829 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -77,16 +77,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks) -- 2.11.0
Re: edac/i7300_edac.c:307: strange macro ?
On Wed, Jan 11, 2017 at 04:48:51PM +, David Binderman wrote: > Hello there, > > drivers/edac/i7300_edac.c:307:32: warning: ‘*’ in boolean context, suggest > ‘&&’ instead [-Wint-in-bool-context] Are you adding some other -W-switches to the kernel Makefile? :-) > Source code is > > #define IS_SECOND_CH(v) ((v) * (1 << 17)) Looks like a bug to me. According to the chipset doc, bit 17 in REDMEMB is the locator bit for CS[3:2] which probably means the second channel but multiplying the full register value with 131072 looks wrong. > Maybe better code > > #define IS_SECOND_CH(v) ((v) & (1 << 17)) Yap. Mauro, what's up? -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.
[PATCH 1/2] rhashtable: simplify a strange allocation pattern
From: Michal Hocko alloc_bucket_locks allocation pattern is quite unusual. We are preferring vmalloc when CONFIG_NUMA is enabled. The rationale is that vmalloc will respect the memory policy of the current process and so the backing memory will get distributed over multiple nodes if the requester is configured properly. At least that is the intention, in reality rhashtable is shrunk and expanded from a kernel worker so no mempolicy can be assumed. Let's just simplify the code and use kvmalloc helper, which is a transparent way to use kmalloc with vmalloc fallback, if the caller is allowed to block and use the flag otherwise. Cc: Tom Herbert Signed-off-by: Michal Hocko --- lib/rhashtable.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..1a487ea70829 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -77,16 +77,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks) -- 2.11.0