[PATCH v3] media: i2c: tda1997: replace codec to component

2018-04-05 Thread Kuninori Morimoto

From: Kuninori Morimoto 

Now we can replace Codec to Component. Let's do it.

Note:
xxx_codec_xxx() ->  xxx_component_xxx()
.idle_bias_off = 0  ->  .idle_bias_on = 1
.ignore_pmdown_time = 0 ->  .use_pmdown_time = 1
-   ->  .endianness = 1
-   ->  .non_legacy_dai_naming = 1

Signed-off-by: Kuninori Morimoto 
---
Tim, Mauro

This replace patch is needed for ALSA SoC, otherwise it can't probe anymore.
Mark, I think it needs your acked-by ?

v2 -> v3

 - fixup driver name (= tda1997)

 drivers/media/i2c/tda1997x.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c
index 3021913..33d7fcf 100644
--- a/drivers/media/i2c/tda1997x.c
+++ b/drivers/media/i2c/tda1997x.c
@@ -2444,7 +2444,7 @@ static int tda1997x_pcm_startup(struct snd_pcm_substream 
*substream,
struct snd_soc_dai *dai)
 {
struct tda1997x_state *state = snd_soc_dai_get_drvdata(dai);
-   struct snd_soc_codec *codec = dai->codec;
+   struct snd_soc_component *component = dai->component;
struct snd_pcm_runtime *rtd = substream->runtime;
int rate, err;
 
@@ -2452,11 +2452,11 @@ static int tda1997x_pcm_startup(struct 
snd_pcm_substream *substream,
err = snd_pcm_hw_constraint_minmax(rtd, SNDRV_PCM_HW_PARAM_RATE,
   rate, rate);
if (err < 0) {
-   dev_err(codec->dev, "failed to constrain samplerate to %dHz\n",
+   dev_err(component->dev, "failed to constrain samplerate to %dHz\n",
rate);
return err;
}
-   dev_info(codec->dev, "set samplerate constraint to %dHz\n", rate);
+   dev_info(component->dev, "set samplerate constraint to %dHz\n", rate);
 
return 0;
 }
@@ -2479,20 +2479,22 @@ static int tda1997x_pcm_startup(struct 
snd_pcm_substream *substream,
.ops = &tda1997x_dai_ops,
 };
 
-static int tda1997x_codec_probe(struct snd_soc_codec *codec)
+static int tda1997x_codec_probe(struct snd_soc_component *component)
 {
return 0;
 }
 
-static int tda1997x_codec_remove(struct snd_soc_codec *codec)
+static void tda1997x_codec_remove(struct snd_soc_component *component)
 {
-   return 0;
 }
 
-static struct snd_soc_codec_driver tda1997x_codec_driver = {
-   .probe = tda1997x_codec_probe,
-   .remove = tda1997x_codec_remove,
-   .reg_word_size = sizeof(u16),
+static struct snd_soc_component_driver tda1997x_codec_driver = {
+   .probe  = tda1997x_codec_probe,
+   .remove = tda1997x_codec_remove,
+   .idle_bias_on   = 1,
+   .use_pmdown_time= 1,
+   .endianness = 1,
+   .non_legacy_dai_naming  = 1,
 };
 
 static int tda1997x_probe(struct i2c_client *client,
@@ -2737,7 +2739,7 @@ static int tda1997x_probe(struct i2c_client *client,
else
formats = SNDRV_PCM_FMTBIT_S16_LE;
tda1997x_audio_dai.capture.formats = formats;
-   ret = snd_soc_register_codec(&state->client->dev,
+   ret = devm_snd_soc_register_component(&state->client->dev,
 &tda1997x_codec_driver,
 &tda1997x_audio_dai, 1);
if (ret) {
@@ -2782,7 +2784,6 @@ static int tda1997x_remove(struct i2c_client *client)
struct tda1997x_platform_data *pdata = &state->pdata;
 
if (pdata->audout_format) {
-   snd_soc_unregister_codec(&client->dev);
mutex_destroy(&state->audio_lock);
}
 
-- 
1.9.1



[PATCH v3] media: i2c: tda1997: replace codec to component

2018-04-05 Thread Kuninori Morimoto

From: Kuninori Morimoto 

Now we can replace Codec to Component. Let's do it.

Note:
xxx_codec_xxx() ->  xxx_component_xxx()
.idle_bias_off = 0  ->  .idle_bias_on = 1
.ignore_pmdown_time = 0 ->  .use_pmdown_time = 1
-   ->  .endianness = 1
-   ->  .non_legacy_dai_naming = 1

Signed-off-by: Kuninori Morimoto 
---
Tim, Mauro

This replace patch is needed for ALSA SoC, otherwise it can't probe anymore.
Mark, I think it needs your acked-by ?

v2 -> v3

 - fixup driver name (= tda1997)

 drivers/media/i2c/tda1997x.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c
index 3021913..33d7fcf 100644
--- a/drivers/media/i2c/tda1997x.c
+++ b/drivers/media/i2c/tda1997x.c
@@ -2444,7 +2444,7 @@ static int tda1997x_pcm_startup(struct snd_pcm_substream 
*substream,
struct snd_soc_dai *dai)
 {
struct tda1997x_state *state = snd_soc_dai_get_drvdata(dai);
-   struct snd_soc_codec *codec = dai->codec;
+   struct snd_soc_component *component = dai->component;
struct snd_pcm_runtime *rtd = substream->runtime;
int rate, err;
 
@@ -2452,11 +2452,11 @@ static int tda1997x_pcm_startup(struct 
snd_pcm_substream *substream,
err = snd_pcm_hw_constraint_minmax(rtd, SNDRV_PCM_HW_PARAM_RATE,
   rate, rate);
if (err < 0) {
-   dev_err(codec->dev, "failed to constrain samplerate to %dHz\n",
+   dev_err(component->dev, "failed to constrain samplerate to %dHz\n",
rate);
return err;
}
-   dev_info(codec->dev, "set samplerate constraint to %dHz\n", rate);
+   dev_info(component->dev, "set samplerate constraint to %dHz\n", rate);
 
return 0;
 }
@@ -2479,20 +2479,22 @@ static int tda1997x_pcm_startup(struct 
snd_pcm_substream *substream,
.ops = &tda1997x_dai_ops,
 };
 
-static int tda1997x_codec_probe(struct snd_soc_codec *codec)
+static int tda1997x_codec_probe(struct snd_soc_component *component)
 {
return 0;
 }
 
-static int tda1997x_codec_remove(struct snd_soc_codec *codec)
+static void tda1997x_codec_remove(struct snd_soc_component *component)
 {
-   return 0;
 }
 
-static struct snd_soc_codec_driver tda1997x_codec_driver = {
-   .probe = tda1997x_codec_probe,
-   .remove = tda1997x_codec_remove,
-   .reg_word_size = sizeof(u16),
+static struct snd_soc_component_driver tda1997x_codec_driver = {
+   .probe  = tda1997x_codec_probe,
+   .remove = tda1997x_codec_remove,
+   .idle_bias_on   = 1,
+   .use_pmdown_time= 1,
+   .endianness = 1,
+   .non_legacy_dai_naming  = 1,
 };
 
 static int tda1997x_probe(struct i2c_client *client,
@@ -2737,7 +2739,7 @@ static int tda1997x_probe(struct i2c_client *client,
else
formats = SNDRV_PCM_FMTBIT_S16_LE;
tda1997x_audio_dai.capture.formats = formats;
-   ret = snd_soc_register_codec(&state->client->dev,
+   ret = devm_snd_soc_register_component(&state->client->dev,
 &tda1997x_codec_driver,
 &tda1997x_audio_dai, 1);
if (ret) {
@@ -2782,7 +2784,6 @@ static int tda1997x_remove(struct i2c_client *client)
struct tda1997x_platform_data *pdata = &state->pdata;
 
if (pdata->audout_format) {
-   snd_soc_unregister_codec(&client->dev);
mutex_destroy(&state->audio_lock);
}
 
-- 
1.9.1



linux-next: Tree for Apr 6

2018-04-05 Thread Stephen Rothwell
Hi all,

Please do not add any v4.18 destined stuff to your linux-next included
trees until after v4.17-rc1 has been released.

Changes since 20180405:

The vfs tree still had its build failure for which I reverted a commit.

The parisc-hd tree gained a build failure for which I applied a patch.

Non-merge commits (relative to Linus' tree): 3739
 3955 files changed, 139059 insertions(+), 84448 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), I do an x86_64 modules_install followed by
builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
and sparc64 defconfig. And finally, a simple boot test of the powerpc
pseries_le_defconfig kernel in qemu (with and without kvm enabled).

Below is a summary of the state of the merge.

I am currently merging 258 trees (counting Linus' and 44 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (052c220da392 Merge tag 'scsi-for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi)
Merging fixes/master (147a89bc71e7 Merge tag 'kconfig-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging kbuild-current/fixes (28913ee8191a netfilter: nf_nat_snmp_basic: add 
correct dependency to Makefile)
Merging arc-current/for-curr (661e50bc8532 Linux 4.16-rc4)
Merging arm-current/fixes (1b8837b61714 ARM: 8750/1: deflate_xip_data.sh: minor 
fixes)
Merging arm64-fixes/for-next/fixes (e21da1c99200 arm64: Relax 
ARM_SMCCC_ARCH_WORKAROUND_1 discovery)
Merging m68k-current/for-linus (ecd685580c8f m68k/mac: Remove bogus "FIXME" 
comment)
Merging powerpc-fixes/fixes (52396500f97c powerpc/64s: Fix i-side SLB miss bad 
address handler saving nonvolatile GPRs)
Merging sparc/master (17dec0a94915 Merge branch 'userns-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace)
Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2)
Merging net/master (d68a19f89bcf Merge branch 'net-tunnel-name-validate')
Merging bpf/master (33491588c1fb kernel/bpf/syscall: fix warning defined but 
not used)
Merging ipsec/master (9a3fb9fb84cc xfrm: Fix transport mode skb control buffer 
usage.)
Merging netfilter/master (b9fc828debc8 qede: Fix barrier usage after tx 
doorbell write.)
Merging ipvs/master (f7fb77fc1235 netfilter: nft_compat: check extension hook 
mask only if set)
Merging wireless-drivers/master (4608f064532c Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next)
Merging mac80211/master (b5dbc28762fd Merge tag 'kbuild-fixes-v4.16-3' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging rdma-fixes/for-rc (84652aefb347 RDMA/ucma: Introduce safer 
rdma_addr_size() variants)
Merging sound-current/for-linus (a820ccbe21e8 ALSA: pcm: Fix UAF at PCM release 
via PCM timer access)
Merging pci-current/for-linus (fc110ebdd014 PCI: dwc: Fix enumeration end when 
reaching root subordinate)
Merging driver-core.current/driver-core-linus (0c8efd610b58 Linux 4.16-rc5)
Merging tty.current/tty-linus (c698ca527893 Linux 4.16-rc6)
Merging usb.current/usb-linus (c698ca527893 Linux 4.16-rc6)
Merging usb-gadget-fixes/fixes (c6ba5084ce0d usb: gadget: udc: renesas_usb3: 
add binging for r8a77965)
Merging usb-serial-fixes/usb-linus (86d71233b615 USB: serial: ftdi_sio: add 
support for Harman FirmwareHubEmulator)
Merging usb-chipidea-fixes/ci-for-usb-stable (964728f9f407 USB: chipidea: msm: 
fix ulpi-node lookup)
Merging phy/fixes (59fba0869aca phy: qcom-ufs: add MODULE_LICENSE tag)
Merging staging.current/staging-linus (df34df483a97 Merge tag 
'staging-4.17-rc1' of 
git://git.kernel.org/p

linux-next: Tree for Apr 6

2018-04-05 Thread Stephen Rothwell
Hi all,

Please do not add any v4.18 destined stuff to your linux-next included
trees until after v4.17-rc1 has been released.

Changes since 20180405:

The vfs tree still had its build failure for which I reverted a commit.

The parisc-hd tree gained a build failure for which I applied a patch.

Non-merge commits (relative to Linus' tree): 3739
 3955 files changed, 139059 insertions(+), 84448 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), I do an x86_64 modules_install followed by
builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
and sparc64 defconfig. And finally, a simple boot test of the powerpc
pseries_le_defconfig kernel in qemu (with and without kvm enabled).

Below is a summary of the state of the merge.

I am currently merging 258 trees (counting Linus' and 44 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (052c220da392 Merge tag 'scsi-for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi)
Merging fixes/master (147a89bc71e7 Merge tag 'kconfig-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging kbuild-current/fixes (28913ee8191a netfilter: nf_nat_snmp_basic: add 
correct dependency to Makefile)
Merging arc-current/for-curr (661e50bc8532 Linux 4.16-rc4)
Merging arm-current/fixes (1b8837b61714 ARM: 8750/1: deflate_xip_data.sh: minor 
fixes)
Merging arm64-fixes/for-next/fixes (e21da1c99200 arm64: Relax 
ARM_SMCCC_ARCH_WORKAROUND_1 discovery)
Merging m68k-current/for-linus (ecd685580c8f m68k/mac: Remove bogus "FIXME" 
comment)
Merging powerpc-fixes/fixes (52396500f97c powerpc/64s: Fix i-side SLB miss bad 
address handler saving nonvolatile GPRs)
Merging sparc/master (17dec0a94915 Merge branch 'userns-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace)
Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2)
Merging net/master (d68a19f89bcf Merge branch 'net-tunnel-name-validate')
Merging bpf/master (33491588c1fb kernel/bpf/syscall: fix warning defined but 
not used)
Merging ipsec/master (9a3fb9fb84cc xfrm: Fix transport mode skb control buffer 
usage.)
Merging netfilter/master (b9fc828debc8 qede: Fix barrier usage after tx 
doorbell write.)
Merging ipvs/master (f7fb77fc1235 netfilter: nft_compat: check extension hook 
mask only if set)
Merging wireless-drivers/master (4608f064532c Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next)
Merging mac80211/master (b5dbc28762fd Merge tag 'kbuild-fixes-v4.16-3' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging rdma-fixes/for-rc (84652aefb347 RDMA/ucma: Introduce safer 
rdma_addr_size() variants)
Merging sound-current/for-linus (a820ccbe21e8 ALSA: pcm: Fix UAF at PCM release 
via PCM timer access)
Merging pci-current/for-linus (fc110ebdd014 PCI: dwc: Fix enumeration end when 
reaching root subordinate)
Merging driver-core.current/driver-core-linus (0c8efd610b58 Linux 4.16-rc5)
Merging tty.current/tty-linus (c698ca527893 Linux 4.16-rc6)
Merging usb.current/usb-linus (c698ca527893 Linux 4.16-rc6)
Merging usb-gadget-fixes/fixes (c6ba5084ce0d usb: gadget: udc: renesas_usb3: 
add binging for r8a77965)
Merging usb-serial-fixes/usb-linus (86d71233b615 USB: serial: ftdi_sio: add 
support for Harman FirmwareHubEmulator)
Merging usb-chipidea-fixes/ci-for-usb-stable (964728f9f407 USB: chipidea: msm: 
fix ulpi-node lookup)
Merging phy/fixes (59fba0869aca phy: qcom-ufs: add MODULE_LICENSE tag)
Merging staging.current/staging-linus (df34df483a97 Merge tag 
'staging-4.17-rc1' of 
git://git.kernel.org/p

Re: [PATCH] staging: greybus: Fix warning to limit chars per line

2018-04-05 Thread Viresh Kumar
On Fri, Apr 6, 2018 at 3:49 AM, Mark Greer  wrote:
> On Wed, Apr 04, 2018 at 12:02:46AM +0530, Gaurav Dhingra wrote:
>> Wrap comment to fix warning "prefer a maximum 75 chars per line"
>>
>> Signed-off-by: Gaurav Dhingra 
>> ---
>>  drivers/staging/greybus/audio_codec.h | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/staging/greybus/audio_codec.h 
>> b/drivers/staging/greybus/audio_codec.h
>> index a1d5440..01838d9 100644
>> --- a/drivers/staging/greybus/audio_codec.h
>> +++ b/drivers/staging/greybus/audio_codec.h
>> @@ -23,7 +23,9 @@ enum {
>>   NUM_CODEC_DAIS,
>>  };
>>
>> -/* device_type should be same as defined in audio.h (Android media layer) */
>> +/* device_type should be same as defined in audio.h

This isn't the right way to write a multi-line comment. It should be like:

/*
 * 
 * 
 */

>> + * (Android media layer)
>> + */
>>  enum {
>>   GBAUDIO_DEVICE_NONE = 0x0,
>>   /* reserved bits */
>> --
>> 1.9.1
>
> Hi Gaurav.
>
> Thank you for the patch, it looks fine to me.
>
> Reviewed-by: Mark Greer 


Re: [PATCH] staging: greybus: Fix warning to limit chars per line

2018-04-05 Thread Viresh Kumar
On Fri, Apr 6, 2018 at 3:49 AM, Mark Greer  wrote:
> On Wed, Apr 04, 2018 at 12:02:46AM +0530, Gaurav Dhingra wrote:
>> Wrap comment to fix warning "prefer a maximum 75 chars per line"
>>
>> Signed-off-by: Gaurav Dhingra 
>> ---
>>  drivers/staging/greybus/audio_codec.h | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/staging/greybus/audio_codec.h 
>> b/drivers/staging/greybus/audio_codec.h
>> index a1d5440..01838d9 100644
>> --- a/drivers/staging/greybus/audio_codec.h
>> +++ b/drivers/staging/greybus/audio_codec.h
>> @@ -23,7 +23,9 @@ enum {
>>   NUM_CODEC_DAIS,
>>  };
>>
>> -/* device_type should be same as defined in audio.h (Android media layer) */
>> +/* device_type should be same as defined in audio.h

This isn't the right way to write a multi-line comment. It should be like:

/*
 * 
 * 
 */

>> + * (Android media layer)
>> + */
>>  enum {
>>   GBAUDIO_DEVICE_NONE = 0x0,
>>   /* reserved bits */
>> --
>> 1.9.1
>
> Hi Gaurav.
>
> Thank you for the patch, it looks fine to me.
>
> Reviewed-by: Mark Greer 


Re: [PATCH] mm: shmem: enable thp migration (Re: [PATCH v1] mm: consider non-anonymous thp as unmovable page)

2018-04-05 Thread Naoya Horiguchi
On Fri, Apr 06, 2018 at 03:07:11AM +, Horiguchi Naoya(堀口 直也) wrote:
...
> -
> From e31ec037701d1cc76b26226e4b66d8c783d40889 Mon Sep 17 00:00:00 2001
> From: Naoya Horiguchi 
> Date: Fri, 6 Apr 2018 10:58:35 +0900
> Subject: [PATCH] mm: enable thp migration for shmem thp
> 
> My testing for the latest kernel supporting thp migration showed an
> infinite loop in offlining the memory block that is filled with shmem
> thps.  We can get out of the loop with a signal, but kernel should
> return with failure in this case.
> 
> What happens in the loop is that scan_movable_pages() repeats returning
> the same pfn without any progress. That's because page migration always
> fails for shmem thps.
> 
> In memory offline code, memory blocks containing unmovable pages should
> be prevented from being offline targets by has_unmovable_pages() inside
> start_isolate_page_range(). So it's possible to change migratability
> for non-anonymous thps to avoid the issue, but it introduces more complex
> and thp-specific handling in migration code, so it might not good.
> 
> So this patch is suggesting to fix the issue by enabling thp migration
> for shmem thp. Both of anon/shmem thp are migratable so we don't need
> precheck about the type of thps.
> 
> Fixes: commit 72b39cfc4d75 ("mm, memory_hotplug: do not fail offlining too 
> early")
> Signed-off-by: Naoya Horiguchi 
> Cc: sta...@vger.kernel.org # v4.15+

... oh, I don't think this is suitable for stable.
Michal's fix in another email can come first with "CC: stable",
then this one.
Anyway I want to get some feedback on the change of this patch.

Thanks,
Naoya Horiguchi

> ---
>  mm/huge_memory.c |  5 -
>  mm/migrate.c | 19 ---
>  mm/rmap.c|  3 ---
>  3 files changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2aff58624886..933c1bbd3464 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2926,7 +2926,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk 
> *pvmw, struct page *new)
>   pmde = maybe_pmd_mkwrite(pmde, vma);
>  
>   flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
> - page_add_anon_rmap(new, vma, mmun_start, true);
> + if (PageAnon(new))
> + page_add_anon_rmap(new, vma, mmun_start, true);
> + else
> + page_add_file_rmap(new, true);
>   set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
>   if (vma->vm_flags & VM_LOCKED)
>   mlock_vma_page(new);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index bdef905b1737..f92dd9f50981 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>   pslot = radix_tree_lookup_slot(&mapping->i_pages,
>   page_index(page));
>  
> - expected_count += 1 + page_has_private(page);
> + expected_count += hpage_nr_pages(page) + page_has_private(page);
>   if (page_count(page) != expected_count ||
>   radix_tree_deref_slot_protected(pslot,
>   &mapping->i_pages.xa_lock) != page) {
> @@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>*/
>   newpage->index = page->index;
>   newpage->mapping = page->mapping;
> - get_page(newpage);  /* add cache reference */
> + page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
>   if (PageSwapBacked(page)) {
>   __SetPageSwapBacked(newpage);
>   if (PageSwapCache(page)) {
> @@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>   }
>  
>   radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
> + if (PageTransHuge(page)) {
> + int i;
> + int index = page_index(page);
> +
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + pslot = radix_tree_lookup_slot(&mapping->i_pages,
> +index + i);
> + radix_tree_replace_slot(&mapping->i_pages, pslot,
> + newpage + i);
> + }
> + } else {
> + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
> + }
>  
>   /*
>* Drop cache reference from old page by unfreezing
>* to one less reference.
>* We know this isn't the last reference.
>*/
> - page_ref_unfreeze(page, expected_count - 1);
> + page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
>  
>   xa_unlock(&mapping->i_pages);
>   /* Leave irq disabled to prevent preemption while updating stats */
> diff --git a/mm/rmap.c b/mm/rmap.c
> index f0dd4e4565bc..8d5337fed37b 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>   if (!pvmw.pte && (flags & 

Re: [PATCH] mm: shmem: enable thp migration (Re: [PATCH v1] mm: consider non-anonymous thp as unmovable page)

2018-04-05 Thread Naoya Horiguchi
On Fri, Apr 06, 2018 at 03:07:11AM +, Horiguchi Naoya(堀口 直也) wrote:
...
> -
> From e31ec037701d1cc76b26226e4b66d8c783d40889 Mon Sep 17 00:00:00 2001
> From: Naoya Horiguchi 
> Date: Fri, 6 Apr 2018 10:58:35 +0900
> Subject: [PATCH] mm: enable thp migration for shmem thp
> 
> My testing for the latest kernel supporting thp migration showed an
> infinite loop in offlining the memory block that is filled with shmem
> thps.  We can get out of the loop with a signal, but kernel should
> return with failure in this case.
> 
> What happens in the loop is that scan_movable_pages() repeats returning
> the same pfn without any progress. That's because page migration always
> fails for shmem thps.
> 
> In memory offline code, memory blocks containing unmovable pages should
> be prevented from being offline targets by has_unmovable_pages() inside
> start_isolate_page_range(). So it's possible to change migratability
> for non-anonymous thps to avoid the issue, but it introduces more complex
> and thp-specific handling in migration code, so it might not good.
> 
> So this patch is suggesting to fix the issue by enabling thp migration
> for shmem thp. Both of anon/shmem thp are migratable so we don't need
> precheck about the type of thps.
> 
> Fixes: commit 72b39cfc4d75 ("mm, memory_hotplug: do not fail offlining too 
> early")
> Signed-off-by: Naoya Horiguchi 
> Cc: sta...@vger.kernel.org # v4.15+

... oh, I don't think this is suitable for stable.
Michal's fix in another email can come first with "CC: stable",
then this one.
Anyway I want to get some feedback on the change of this patch.

Thanks,
Naoya Horiguchi

> ---
>  mm/huge_memory.c |  5 -
>  mm/migrate.c | 19 ---
>  mm/rmap.c|  3 ---
>  3 files changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2aff58624886..933c1bbd3464 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2926,7 +2926,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk 
> *pvmw, struct page *new)
>   pmde = maybe_pmd_mkwrite(pmde, vma);
>  
>   flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
> - page_add_anon_rmap(new, vma, mmun_start, true);
> + if (PageAnon(new))
> + page_add_anon_rmap(new, vma, mmun_start, true);
> + else
> + page_add_file_rmap(new, true);
>   set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
>   if (vma->vm_flags & VM_LOCKED)
>   mlock_vma_page(new);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index bdef905b1737..f92dd9f50981 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>   pslot = radix_tree_lookup_slot(&mapping->i_pages,
>   page_index(page));
>  
> - expected_count += 1 + page_has_private(page);
> + expected_count += hpage_nr_pages(page) + page_has_private(page);
>   if (page_count(page) != expected_count ||
>   radix_tree_deref_slot_protected(pslot,
>   &mapping->i_pages.xa_lock) != page) {
> @@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>*/
>   newpage->index = page->index;
>   newpage->mapping = page->mapping;
> - get_page(newpage);  /* add cache reference */
> + page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
>   if (PageSwapBacked(page)) {
>   __SetPageSwapBacked(newpage);
>   if (PageSwapCache(page)) {
> @@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space 
> *mapping,
>   }
>  
>   radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
> + if (PageTransHuge(page)) {
> + int i;
> + int index = page_index(page);
> +
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + pslot = radix_tree_lookup_slot(&mapping->i_pages,
> +index + i);
> + radix_tree_replace_slot(&mapping->i_pages, pslot,
> + newpage + i);
> + }
> + } else {
> + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
> + }
>  
>   /*
>* Drop cache reference from old page by unfreezing
>* to one less reference.
>* We know this isn't the last reference.
>*/
> - page_ref_unfreeze(page, expected_count - 1);
> + page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
>  
>   xa_unlock(&mapping->i_pages);
>   /* Leave irq disabled to prevent preemption while updating stats */
> diff --git a/mm/rmap.c b/mm/rmap.c
> index f0dd4e4565bc..8d5337fed37b 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>   if (!pvmw.pte && (flags & TTU_MIGRATION)) {
>   

Re: [PATCH 03/10] staging: fnic2 add fip handling

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:18:37PM -0700, Oliver Smith-Denny wrote:
> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fip.c
> @@ -0,0 +1,804 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright 2018 Cisco Systems, Inc.  All rights reserved.
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +/*! \file */

What is that line for?


Re: [PATCH 03/10] staging: fnic2 add fip handling

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:18:37PM -0700, Oliver Smith-Denny wrote:
> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fip.c
> @@ -0,0 +1,804 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright 2018 Cisco Systems, Inc.  All rights reserved.
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +/*! \file */

What is that line for?


Re: [PATCH 02/10] staging: fnic2 add resource allocation

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:17:52PM -0700, Oliver Smith-Denny wrote:
> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fnic2_isr.c
> @@ -0,0 +1,324 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright 2018 Cisco Systems, Inc.  All rights reserved.
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */

No need for the messy boilerplate text if you have a SPDX line, please
remove both of those paragraphs.

thanks,

greg k-h


Re: [PATCH 02/10] staging: fnic2 add resource allocation

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:17:52PM -0700, Oliver Smith-Denny wrote:
> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fnic2_isr.c
> @@ -0,0 +1,324 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright 2018 Cisco Systems, Inc.  All rights reserved.
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */

No need for the messy boilerplate text if you have a SPDX line, please
remove both of those paragraphs.

thanks,

greg k-h


Re: [PATCH 01/10] staging: fnic2 add initialization

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:16:45PM -0700, Oliver Smith-Denny wrote:
> These files contain module load and unload, global driver context,
> PCI registration, PCI probe and remove, and definitions of
> the fnic2 global context.
> 
> Signed-off-by: Oliver Smith-Denny 
> Signed-off-by: Sesidhar Baddela 
> Signed-off-by: Anil Chintalapati 
> Signed-off-by: Arulprabhu Ponnusamy 
> Signed-off-by: Gian Carlo Boffa 
> Co-Developed-by: Arulprabhu Ponnusamy 
> Co-Developed-by: Gian Carlo Boffa 
> Co-Developed-by: Oliver Smith-Denny 
> ---
>  drivers/staging/fnic2/src/fnic2.h  | 256 
>  drivers/staging/fnic2/src/fnic2_main.c | 711 
> +
>  2 files changed, 967 insertions(+)
>  create mode 100644 drivers/staging/fnic2/src/fnic2.h
>  create mode 100644 drivers/staging/fnic2/src/fnic2_main.c

Why is this a drivers/staging/ driver at all?  What is keeping you from
getting this merged into the "proper" place in the kernel?

If you have a staging driver, you have to have a TODO file in the
directory listing what is keeping this in the staging section.

Also, one tiny thing to fix up:

> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fnic2.h
> @@ -0,0 +1,256 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0

Please read the documentation on how to properly use SPDX tags on kernel
files.  This needs to be the first line of the file.

thanks,

greg k-h


Re: [PATCH 01/10] staging: fnic2 add initialization

2018-04-05 Thread Greg Kroah-Hartman
On Thu, Apr 05, 2018 at 02:16:45PM -0700, Oliver Smith-Denny wrote:
> These files contain module load and unload, global driver context,
> PCI registration, PCI probe and remove, and definitions of
> the fnic2 global context.
> 
> Signed-off-by: Oliver Smith-Denny 
> Signed-off-by: Sesidhar Baddela 
> Signed-off-by: Anil Chintalapati 
> Signed-off-by: Arulprabhu Ponnusamy 
> Signed-off-by: Gian Carlo Boffa 
> Co-Developed-by: Arulprabhu Ponnusamy 
> Co-Developed-by: Gian Carlo Boffa 
> Co-Developed-by: Oliver Smith-Denny 
> ---
>  drivers/staging/fnic2/src/fnic2.h  | 256 
>  drivers/staging/fnic2/src/fnic2_main.c | 711 
> +
>  2 files changed, 967 insertions(+)
>  create mode 100644 drivers/staging/fnic2/src/fnic2.h
>  create mode 100644 drivers/staging/fnic2/src/fnic2_main.c

Why is this a drivers/staging/ driver at all?  What is keeping you from
getting this merged into the "proper" place in the kernel?

If you have a staging driver, you have to have a TODO file in the
directory listing what is keeping this in the staging section.

Also, one tiny thing to fix up:

> --- /dev/null
> +++ b/drivers/staging/fnic2/src/fnic2.h
> @@ -0,0 +1,256 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0

Please read the documentation on how to properly use SPDX tags on kernel
files.  This needs to be the first line of the file.

thanks,

greg k-h


Re: [PATCH v3] cpufreq: cppc_cpufreq: Initialize shared cpu's perf capabilities

2018-04-05 Thread Viresh Kumar
On 06-04-18, 10:43, Shunyong Yang wrote:
> When multiple cpus are related in one cpufreq policy, the first online
> cpu will be chosen by default to handle cpufreq operations. Let's take
> cpu0 and cpu1 as an example.
> 
> When cpu0 is offline, policy->cpu will be shifted to cpu1. Cpu1's perf
> capabilities should be initialized. Otherwise, perf capabilities are 0s
> and speed change can not take effect.
> 
> This patch copies perf capabilities of the first online cpu to other
> shared cpus when policy shared type is CPUFREQ_SHARED_TYPE_ANY.
> 
> Cc: Joey Zheng 
> Acked-by: Viresh Kumar 
> Signed-off-by: Shunyong Yang 
> ---
> 
> Changes in v3:
>   -Remove unnecessary wrap per Kumar's comments.
> 
> Changes in v2:
>   -Add unlikely in cpu comparison per Kumar's comments.
>   -Fix coding style per Kumar's comments.
> 
> Changes in v1:
>   -Drop RFC tag,
>  The original RFC link,
>  https://patchwork.kernel.org/patch/10299055/.
> 
>  This patch solves same issue as RFC above.
> 
>  Patch name is changed as code is too much different with RFC above.
> 
>   -Remove extra init() per Viresh Kumar's comments and only handle
>CPPC CPUFREQ_SHARED_TYPE_ANY case.
> 
> ---
>  drivers/cpufreq/cppc_cpufreq.c | 14 --
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
> index 8300a9fcb80c..8b432d6e846d 100644
> --- a/drivers/cpufreq/cppc_cpufreq.c
> +++ b/drivers/cpufreq/cppc_cpufreq.c
> @@ -167,9 +167,19 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
> *policy)
>   NSEC_PER_USEC;
>   policy->shared_type = cpu->shared_type;
>  
> - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
> + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
> + int i;
> +
>   cpumask_copy(policy->cpus, cpu->shared_cpu_map);
> - else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
> +
> + for_each_cpu(i, policy->cpus) {
> + if (unlikely(i == policy->cpu))
> + continue;
> +
> + memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps,
> +sizeof(cpu->perf_caps));
> + }
> + } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
>   /* Support only SW_ANY for now. */
>   pr_debug("Unsupported CPU co-ord type\n");
>   return -EFAULT;

LGTM. Thanks.

-- 
viresh


Re: [PATCH v3] cpufreq: cppc_cpufreq: Initialize shared cpu's perf capabilities

2018-04-05 Thread Viresh Kumar
On 06-04-18, 10:43, Shunyong Yang wrote:
> When multiple cpus are related in one cpufreq policy, the first online
> cpu will be chosen by default to handle cpufreq operations. Let's take
> cpu0 and cpu1 as an example.
> 
> When cpu0 is offline, policy->cpu will be shifted to cpu1. Cpu1's perf
> capabilities should be initialized. Otherwise, perf capabilities are 0s
> and speed change can not take effect.
> 
> This patch copies perf capabilities of the first online cpu to other
> shared cpus when policy shared type is CPUFREQ_SHARED_TYPE_ANY.
> 
> Cc: Joey Zheng 
> Acked-by: Viresh Kumar 
> Signed-off-by: Shunyong Yang 
> ---
> 
> Changes in v3:
>   -Remove unnecessary wrap per Kumar's comments.
> 
> Changes in v2:
>   -Add unlikely in cpu comparison per Kumar's comments.
>   -Fix coding style per Kumar's comments.
> 
> Changes in v1:
>   -Drop RFC tag,
>  The original RFC link,
>  https://patchwork.kernel.org/patch/10299055/.
> 
>  This patch solves same issue as RFC above.
> 
>  Patch name is changed as code is too much different with RFC above.
> 
>   -Remove extra init() per Viresh Kumar's comments and only handle
>CPPC CPUFREQ_SHARED_TYPE_ANY case.
> 
> ---
>  drivers/cpufreq/cppc_cpufreq.c | 14 --
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
> index 8300a9fcb80c..8b432d6e846d 100644
> --- a/drivers/cpufreq/cppc_cpufreq.c
> +++ b/drivers/cpufreq/cppc_cpufreq.c
> @@ -167,9 +167,19 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
> *policy)
>   NSEC_PER_USEC;
>   policy->shared_type = cpu->shared_type;
>  
> - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
> + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
> + int i;
> +
>   cpumask_copy(policy->cpus, cpu->shared_cpu_map);
> - else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
> +
> + for_each_cpu(i, policy->cpus) {
> + if (unlikely(i == policy->cpu))
> + continue;
> +
> + memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps,
> +sizeof(cpu->perf_caps));
> + }
> + } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
>   /* Support only SW_ANY for now. */
>   pr_debug("Unsupported CPU co-ord type\n");
>   return -EFAULT;

LGTM. Thanks.

-- 
viresh


Re: [PATCH 05/32] fs: introduce new ->get_poll_head and ->poll_mask methods

2018-04-05 Thread Al Viro
On Fri, Mar 30, 2018 at 05:07:42PM +0200, Christoph Hellwig wrote:

> +  get_poll_head: Returns the struct wait_queue_head that poll, select,
> +  epoll or aio poll should wait on in case this instance only has single
> +  waitqueue.  Can return NULL to indicate polling is not supported,
> +  or a POLL* value using the POLL_TO_PTR helper in case a grave error
> +  occurred and ->poll_mask shall not be called.

> + if (IS_ERR(head))
> + return PTR_TO_POLL(head);

> + * ->get_poll_head can return a __poll_t in the PTR_ERR, use these macros
> + * to return the value and recover it.  It takes care of the negation as
> + * well as off the annotations.
> + */
> +#define POLL_TO_PTR(mask)(ERR_PTR(-(__force int)(mask)))

Uh-oh...
static inline bool __must_check IS_ERR(__force const void *ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned 
long)-MAX_ERRNO)
#define MAX_ERRNO   4095

IOW, your trick relies upon arguments of PTR_TO_POLL being no greater
than 4095.  Now, consider
#define EPOLLRDHUP  (__force __poll_t)0x2000
which is to say, 8192...

So anything that tries e.g. POLL_TO_PTR(EPOLLRDHUP | EPOLLERR) will be in
for a quiet unpleasant surprise...


Re: [PATCH 05/32] fs: introduce new ->get_poll_head and ->poll_mask methods

2018-04-05 Thread Al Viro
On Fri, Mar 30, 2018 at 05:07:42PM +0200, Christoph Hellwig wrote:

> +  get_poll_head: Returns the struct wait_queue_head that poll, select,
> +  epoll or aio poll should wait on in case this instance only has single
> +  waitqueue.  Can return NULL to indicate polling is not supported,
> +  or a POLL* value using the POLL_TO_PTR helper in case a grave error
> +  occurred and ->poll_mask shall not be called.

> + if (IS_ERR(head))
> + return PTR_TO_POLL(head);

> + * ->get_poll_head can return a __poll_t in the PTR_ERR, use these macros
> + * to return the value and recover it.  It takes care of the negation as
> + * well as off the annotations.
> + */
> +#define POLL_TO_PTR(mask)(ERR_PTR(-(__force int)(mask)))

Uh-oh...
static inline bool __must_check IS_ERR(__force const void *ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned 
long)-MAX_ERRNO)
#define MAX_ERRNO   4095

IOW, your trick relies upon arguments of PTR_TO_POLL being no greater
than 4095.  Now, consider
#define EPOLLRDHUP  (__force __poll_t)0x2000
which is to say, 8192...

So anything that tries e.g. POLL_TO_PTR(EPOLLRDHUP | EPOLLERR) will be in
for a quiet unpleasant surprise...


[PATCH] staging: ks7010: replace kmalloc() + memcpy() with kmemdup()

2018-04-05 Thread Ji-Hun Kim
Use kmemdup rather than duplicating its implementation.

Signed-off-by: Ji-Hun Kim 
---
 drivers/staging/ks7010/ks7010_sdio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/staging/ks7010/ks7010_sdio.c 
b/drivers/staging/ks7010/ks7010_sdio.c
index b8f55a1..c8eb55b 100644
--- a/drivers/staging/ks7010/ks7010_sdio.c
+++ b/drivers/staging/ks7010/ks7010_sdio.c
@@ -589,11 +589,10 @@ static int ks7010_sdio_update_index(struct 
ks_wlan_private *priv, u32 index)
int ret;
unsigned char *data_buf;
 
-   data_buf = kmalloc(sizeof(u32), GFP_KERNEL);
+   data_buf = kmemdup(&index, sizeof(u32), GFP_KERNEL);
if (!data_buf)
return -ENOMEM;
 
-   memcpy(data_buf, &index, sizeof(index));
ret = ks7010_sdio_write(priv, WRITE_INDEX, data_buf, sizeof(index));
if (ret)
goto err_free_data_buf;
-- 
1.9.1



[PATCH] staging: ks7010: replace kmalloc() + memcpy() with kmemdup()

2018-04-05 Thread Ji-Hun Kim
Use kmemdup rather than duplicating its implementation.

Signed-off-by: Ji-Hun Kim 
---
 drivers/staging/ks7010/ks7010_sdio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/staging/ks7010/ks7010_sdio.c 
b/drivers/staging/ks7010/ks7010_sdio.c
index b8f55a1..c8eb55b 100644
--- a/drivers/staging/ks7010/ks7010_sdio.c
+++ b/drivers/staging/ks7010/ks7010_sdio.c
@@ -589,11 +589,10 @@ static int ks7010_sdio_update_index(struct 
ks_wlan_private *priv, u32 index)
int ret;
unsigned char *data_buf;
 
-   data_buf = kmalloc(sizeof(u32), GFP_KERNEL);
+   data_buf = kmemdup(&index, sizeof(u32), GFP_KERNEL);
if (!data_buf)
return -ENOMEM;
 
-   memcpy(data_buf, &index, sizeof(index));
ret = ks7010_sdio_write(priv, WRITE_INDEX, data_buf, sizeof(index));
if (ret)
goto err_free_data_buf;
-- 
1.9.1



linux-next: build warning after merge of the akpm-current tree

2018-04-05 Thread Stephen Rothwell
Hi Andrew,

After merging the akpm-current tree, today's linux-next build (x86_64
allmodconfig) produced this warning:

mm/memcontrol.c: In function 'memory_events_show':
mm/memcontrol.c:5453:23: warning: array subscript is above array bounds 
[-Warray-bounds]
  atomic_long_read(&memcg->memory_events[OOM_KILL]));
   ^~~

Introduced by commit

  725844c87a0d ("mm: memcg: make sure memory.events is uptodate when waking 
pollers")

-- 
Cheers,
Stephen Rothwell


pgpryi2jbrxNP.pgp
Description: OpenPGP digital signature


linux-next: build warning after merge of the akpm-current tree

2018-04-05 Thread Stephen Rothwell
Hi Andrew,

After merging the akpm-current tree, today's linux-next build (x86_64
allmodconfig) produced this warning:

mm/memcontrol.c: In function 'memory_events_show':
mm/memcontrol.c:5453:23: warning: array subscript is above array bounds 
[-Warray-bounds]
  atomic_long_read(&memcg->memory_events[OOM_KILL]));
   ^~~

Introduced by commit

  725844c87a0d ("mm: memcg: make sure memory.events is uptodate when waking 
pollers")

-- 
Cheers,
Stephen Rothwell


pgpryi2jbrxNP.pgp
Description: OpenPGP digital signature


Re: [PATCH v3 04/13] ARM: dts: ipq4019: Update ipq4019-dk01.1 board data

2018-04-05 Thread Richard Cochran
On Mon, Apr 02, 2018 at 03:28:47PM +0530, sricha...@codeaurora.org wrote:
>   Yes, i will post another series for ipq806[2/4] updates and the
> corresponding
>   boards after this.

I tried mainline on the ap148 using qcom_defconfig and the
qcom-ipq8064-ap148.dtb, and it doesn't boot FYI.

Thanks,
Richard


Re: [PATCH v3 04/13] ARM: dts: ipq4019: Update ipq4019-dk01.1 board data

2018-04-05 Thread Richard Cochran
On Mon, Apr 02, 2018 at 03:28:47PM +0530, sricha...@codeaurora.org wrote:
>   Yes, i will post another series for ipq806[2/4] updates and the
> corresponding
>   boards after this.

I tried mainline on the ap148 using qcom_defconfig and the
qcom-ipq8064-ap148.dtb, and it doesn't boot FYI.

Thanks,
Richard


[PATCH 1/1] arm64: To remove initrd reserved area entry from memblock

2018-04-05 Thread CHANDAN VN
INITRD reserved area entry is not removed from memblock
even though initrd reserved area is freed. After freeing
the memory it is released from memblock. The same can be
checked from /sys/kernel/debug/memblock/reserved.

The patch makes sure that the initrd entry is removed from
memblock when keepinitrd is not enabled.

Signed-off-by: CHANDAN VN 
---
 arch/arm64/mm/init.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9f3c47a..1b18b47 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -646,8 +646,10 @@ void free_initmem(void)
 
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-   if (!keep_initrd)
+   if (!keep_initrd) {
free_reserved_area((void *)start, (void *)end, 0, "initrd");
+   memblock_free(__virt_to_phys(start), end - start);
+   }
 }
 
 static int __init keepinitrd_setup(char *__unused)
-- 
1.9.1



[PATCH 1/1] arm64: To remove initrd reserved area entry from memblock

2018-04-05 Thread CHANDAN VN
INITRD reserved area entry is not removed from memblock
even though initrd reserved area is freed. After freeing
the memory it is released from memblock. The same can be
checked from /sys/kernel/debug/memblock/reserved.

The patch makes sure that the initrd entry is removed from
memblock when keepinitrd is not enabled.

Signed-off-by: CHANDAN VN 
---
 arch/arm64/mm/init.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9f3c47a..1b18b47 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -646,8 +646,10 @@ void free_initmem(void)
 
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-   if (!keep_initrd)
+   if (!keep_initrd) {
free_reserved_area((void *)start, (void *)end, 0, "initrd");
+   memblock_free(__virt_to_phys(start), end - start);
+   }
 }
 
 static int __init keepinitrd_setup(char *__unused)
-- 
1.9.1



Re: [PATCH] ata: ahci-platform: add reset control support

2018-04-05 Thread Kunihiko Hayashi
Hi Hans,

On Thu, 5 Apr 2018 16:08:24 +0200
Hans de Goede  wrote:

> Hi,
> 
> On 05-04-18 16:00, Hans de Goede wrote:
> > Hi,
> > > On 05-04-18 15:54, Thierry Reding wrote:
> >> On Thu, Apr 05, 2018 at 03:27:03PM +0200, Hans de Goede wrote:
> >>> Hi,
> >>>
> >>> On 05-04-18 15:17, Patrice CHOTARD wrote:
>  Hi Thierry
> 
>  On 04/05/2018 11:54 AM, Thierry Reding wrote:
> > On Fri, Mar 23, 2018 at 10:30:53AM +0900, Kunihiko Hayashi wrote:
> >> Add support to get and control a list of resets for the device
> >> as optional and shared. These resets must be kept de-asserted until
> >> the device is enabled.
> >>
> >> This is specified as shared because some SoCs like UniPhier series
> >> have common reset controls with all ahci controller instances.
> >>
> >> Signed-off-by: Kunihiko Hayashi 
> >> ---
> >>  .../devicetree/bindings/ata/ahci-platform.txt |  1 +
> >>  drivers/ata/ahci.h                            |  1 +
> >>  drivers/ata/libahci_platform.c                | 24 +++++++++++++++++---
> >>  3 files changed, 23 insertions(+), 3 deletions(-)
> >
> > This causes a regression on Tegra because we explicitly request the
> > resets after the call to ahci_platform_get_resources().
> 
>  I confirm, we got exactly the same behavior on STi platform.
> 
> >
> > From a quick look, ahci_mtk and ahci_st are in the same boat, adding the
> > corresponding maintainers to Cc.
> >
> > Patrice, Matthias: does SATA still work for you after this patch? This
> > has been in linux-next since next-20180327.
> 
>  SATA is still working after this patch, but a kernel warning is
>  triggered due to the fact that resets are both requested by
>  libahci_platform and by ahci_st driver.
> >>>
> >>> So in your case you might be able to remove the reset handling
> >>> from the ahci_st driver and rely on the new libahci_platform
> >>> handling instead? If that works that seems like a win to me.
> >>>
> >>> As said elsewhere in this thread I think it makes sense to keep (or re-add
> >>> after a revert) the libahci_platform reset code, but make it conditional
> >>> on a flag passed to ahci_platform_get_resources(). This way we get
> >>> the shared code for most cases and platforms which need special handling
> >>> can opt-out.
> >>
> >> Agreed, although I prefer such helpers to be opt-in, rather than
> >> opt-out. In my experience that tends make the helpers more resilient to
> >> this kind of regression. It also simplifies things because instead of
> >> drivers saying "I want all the helpers except this one and that one",
> >> they can simply say "I want these helpers and that one". In the former
> >> case whenever you add some new (opt-out) feature, you have to update all
> >> drivers and add the exception. In the latter you only need to extend the
> >> drivers that want to make use of the new helper.
> 
> Erm, the idea never was to make this opt-out but rather opt in, so
> we add a flags parameter to ahci_platform_get_resources() and all
> current users pass in 0 for that to keep the current behavior.
> 
> And only the generic drivers/ata/ahci_platform.c driver will pass
> in the new AHCI_PLATFORM_GET_RESETS flag, which makes
> ahci_platform_get_resources() (and the other functions) also deal
> with resets.
> 
> >> With that in mind, rather than adding a flag to the
> >> ahci_platform_get_resources() function, it might be more flexible to
> >> split the helpers into finer-grained functions. That way drivers can
> >> pick whatever functionality they want from the helpers.
> > > Good point, so lets:
> > > 1) Revert the patch for now
> > 2) Have a new version of the patch which adds a ahci_platform_get_resets() 
> > helper
> > 3) Modify the generic drivers/ata/ahci_platform.c driver to call the new
> >    ahci_platform_get_resets() between its ahci_platform_get_resources()
> >    and ahci_platform_enable_resources() calls.
> >    I think that ahci_platform_enable_resources() should still automatically
> >    do the right thing wrt resets if ahci_platform_get_resets() was called
> >    (otherwise the resets array will be empty and should be skipped)
> > > This should make the generic driver usable for the UniPhier SoCs and
> > maybe some other drivers like the ahci_st driver can also switch to the
> > new ahci_platform_get_resets() functionality to reduce their code a bit.
> 
> So thinking slightly longer about this, with the opt-in variant
> (which is what I intended all along) I do think that a flags parameter
> is better, because the whole idea behind lib_ahci_platform is to avoid
> having to do err = get_resource_a(), if (err) bail, err = get_resource_b()
> if (err) bail, etc. in all the ahci (platform) drivers. And having fine
> grained helpers re-introduces that.

In case of adding a flag instead 

Re: [PATCH] ata: ahci-platform: add reset control support

2018-04-05 Thread Kunihiko Hayashi
Hi Hans,

On Thu, 5 Apr 2018 16:08:24 +0200
Hans de Goede  wrote:

> Hi,
> 
> On 05-04-18 16:00, Hans de Goede wrote:
> > Hi,
> > > On 05-04-18 15:54, Thierry Reding wrote:
> >> On Thu, Apr 05, 2018 at 03:27:03PM +0200, Hans de Goede wrote:
> >>> Hi,
> >>>
> >>> On 05-04-18 15:17, Patrice CHOTARD wrote:
>  Hi Thierry
> 
>  On 04/05/2018 11:54 AM, Thierry Reding wrote:
> > On Fri, Mar 23, 2018 at 10:30:53AM +0900, Kunihiko Hayashi wrote:
> >> Add support to get and control a list of resets for the device
> >> as optional and shared. These resets must be kept de-asserted until
> >> the device is enabled.
> >>
> >> This is specified as shared because some SoCs like UniPhier series
> >> have common reset controls with all ahci controller instances.
> >>
> >> Signed-off-by: Kunihiko Hayashi 
> >> ---
> >>  .../devicetree/bindings/ata/ahci-platform.txt |  1 +
> >>  drivers/ata/ahci.h                            |  1 +
> >>  drivers/ata/libahci_platform.c                | 24 +++++++++++++++++---
> >>  3 files changed, 23 insertions(+), 3 deletions(-)
> >
> > This causes a regression on Tegra because we explicitly request the
> > resets after the call to ahci_platform_get_resources().
> 
>  I confirm, we got exactly the same behavior on STi platform.
> 
> >
> > From a quick look, ahci_mtk and ahci_st are in the same boat, adding the
> > corresponding maintainers to Cc.
> >
> > Patrice, Matthias: does SATA still work for you after this patch? This
> > has been in linux-next since next-20180327.
> 
>  SATA is still working after this patch, but a kernel warning is
>  triggered due to the fact that resets are both requested by
>  libahci_platform and by ahci_st driver.
> >>>
> >>> So in your case you might be able to remove the reset handling
> >>> from the ahci_st driver and rely on the new libahci_platform
> >>> handling instead? If that works that seems like a win to me.
> >>>
> >>> As said elsewhere in this thread I think it makes sense to keep (or re-add
> >>> after a revert) the libahci_platform reset code, but make it conditional
> >>> on a flag passed to ahci_platform_get_resources(). This way we get
> >>> the shared code for most cases and platforms which need special handling
> >>> can opt-out.
> >>
> >> Agreed, although I prefer such helpers to be opt-in, rather than
> >> opt-out. In my experience that tends make the helpers more resilient to
> >> this kind of regression. It also simplifies things because instead of
> >> drivers saying "I want all the helpers except this one and that one",
> >> they can simply say "I want these helpers and that one". In the former
> >> case whenever you add some new (opt-out) feature, you have to update all
> >> drivers and add the exception. In the latter you only need to extend the
> >> drivers that want to make use of the new helper.
> 
> Erm, the idea never was to make this opt-out but rather opt in, so
> we add a flags parameter to ahci_platform_get_resources() and all
> current users pass in 0 for that to keep the current behavior.
> 
> And only the generic drivers/ata/ahci_platform.c driver will pass
> in the new AHCI_PLATFORM_GET_RESETS flag, which makes
> ahci_platform_get_resources() (and the other functions) also deal
> with resets.
> 
> >> With that in mind, rather than adding a flag to the
> >> ahci_platform_get_resources() function, it might be more flexible to
> >> split the helpers into finer-grained functions. That way drivers can
> >> pick whatever functionality they want from the helpers.
> > > Good point, so lets:
> > > 1) Revert the patch for now
> > 2) Have a new version of the patch which adds a ahci_platform_get_resets() 
> > helper
> > 3) Modify the generic drivers/ata/ahci_platform.c driver to call the new
> >    ahci_platform_get_resets() between its ahci_platform_get_resources()
> >    and ahci_platform_enable_resources() calls.
> >    I think that ahci_platform_enable_resources() should still automatically
> >    do the right thing wrt resets if ahci_platform_get_resets() was called
> >    (otherwise the resets array will be empty and should be skipped)
> > > This should make the generic driver usable for the UniPhier SoCs and
> > maybe some other drivers like the ahci_st driver can also switch to the
> > new ahci_platform_get_resets() functionality to reduce their code a bit.
> 
> So thinking slightly longer about this, with the opt-in variant
> (which is what I intended all along) I do think that a flags parameter
> is better, because the whole idea behind lib_ahci_platform is to avoid
> having to do err = get_resource_a(), if (err) bail, err = get_resource_b()
> if (err) bail, etc. in all the ahci (platform) drivers. And having fine
> grained helpers re-introduces that.

In case of adding a flag instead of get_resource_a(),
for example, we add the flag 

regression, imx6 and sgtl5000 sound problems

2018-04-05 Thread Mika Penttilä
Hi,

With recent merge to pre 4.17-rc, audio stopped working (or it's hearable but 
way too slow).
imx6q + sgtl5000 codec.

Maybe some of the soc/fsl changes is causing this.

--Mika


regression, imx6 and sgtl5000 sound problems

2018-04-05 Thread Mika Penttilä
Hi,

With recent merge to pre 4.17-rc, audio stopped working (or it's hearable but 
way too slow).
imx6q + sgtl5000 codec.

Maybe some of the soc/fsl changes is causing this.

--Mika


Re: [GIT PULL] Kernel lockdown for secure boot

2018-04-05 Thread Peter Dolding
>
> There's no inherent difference, in terms of the trust chain, between
> compromising it to use the machine as a toaster or to run a botnet - the
> trust chain is compromised either way.  But you're much more likely to
> notice if your desktop starts producing bread products than if it hides
> some malware and keeps on booting, and the second one is much more

> That is to say, as a result of the way malware has been written, our way
> of thinking about it is often that it's a way to build a boot loader for
> a malicious kernel, so that's how we wind up talking about it.  Are we
> concerned with malware stealing your data?  Yes, but Secure Boot is only
> indirectly about that.  It's primarily about denying the malware easy
> mechanisms to build a persistence mechanism.  The uid-0 != ring-0 aspect
> is useful independent of Secure Boot, but Secure Boot without it falls
> way short of accomplishing its goal.
>
> --
I am sorry the issue here this is really expanding Secure Boot to
breaking point.

Yes a person wants a secure system having the boot parts verified by
some means and using a lockdown is advantage.

Problem comes in with the idea that UEFI Secure Boot and lockdown are linked.

If I am running windows and linux on the same machine Secure Boot need
to be on so windows run happy.

Remember its my machine.  If I wish to compromise security on my
machine because it make sense I should be allowed to,

A proper lockdown would prevent you from messing with ACPI tables it a
very creative hack have kernel load a DSDT and have it from ring zero
turn bits in the kernel off.

The reality here is we need to be able to operate without lockdown due
to how badly broken some hardware it to configure system.

Yes the need to include option to push button to disable secure boot
is required due to how badly broken this stuff is.   Of course this
does not address the issue that if I am working on a system from
remote or embedded where I don't  have the push button to turn off as
a option this is still a problem.


Effective lockdown has to protect linux kernel boot parameters,
initramfs and other bits from being modified as well.   This lead us
to problem with the broken hardware in a machine we cannot turn secure
boot off we still need to perform all these alterations.


We do not live in a world of perfect computer hardware so at this
stage proper unattackable secureboot cannot be done.


We would be better off putting effort into improve means with UEFI of
adding own KEK.   This is so that only boot loaders and kernels from
the vendors user has approved in fact to work.  There could also be a
configuration KEK that gets disabled after all the required operating
systems are installed.So Microsoft non OS KEK makes sense to be
the configuration rule breaking KEK but the current deployments of
UEFI don't have a off switch option on it.


One KEK for everyone who is not Microsoft to boot with is highly insecure.


UEFI secureboot falls way short in the validation department currently
because too much is validated under one KEK key.

UEFI also fall short due to failing to provide a system to protect
boot parameters that can alter OS behaviour and make a secure kernel
insecure this include kernels with this lockdown patches,


Really you need to compare UEFI secureboot vs boot loader and /boot on
a read only media.   Every where you can change something in the UEFI
secureboot without is being signed that you cannot in the read only
media of the boot loader and /boot is a defect in the UEFI secureboot
design and implementation.

If boot parameters were properly secured there would be no need for
lockdown query if UEFI was in secureboot mode or not.

Also lockdown being on and kernel and boot loader not running secured
still would provide extra item attacker has to get past.

So fairly much remove the EFI interrogation patches and work with UEFI
to fix it properly.   Hacking around these UEFI defects means we will
end up being stuck with them and the system still not being properly
secured.


Peter Dolding


Re: [GIT PULL] Kernel lockdown for secure boot

2018-04-05 Thread Peter Dolding
>
> There's no inherent difference, in terms of the trust chain, between
> compromising it to use the machine as a toaster or to run a botnet - the
> trust chain is compromised either way.  But you're much more likely to
> notice if your desktop starts producing bread products than if it hides
> some malware and keeps on booting, and the second one is much more

> That is to say, as a result of the way malware has been written, our way
> of thinking about it is often that it's a way to build a boot loader for
> a malicious kernel, so that's how we wind up talking about it.  Are we
> concerned with malware stealing your data?  Yes, but Secure Boot is only
> indirectly about that.  It's primarily about denying the malware easy
> mechanisms to build a persistence mechanism.  The uid-0 != ring-0 aspect
> is useful independent of Secure Boot, but Secure Boot without it falls
> way short of accomplishing its goal.
>
> --
I am sorry the issue here this is really expanding Secure Boot to
breaking point.

Yes a person wants a secure system having the boot parts verified by
some means and using a lockdown is advantage.

Problem comes in with the idea that UEFI Secure Boot and lockdown are linked.

If I am running Windows and Linux on the same machine, Secure Boot needs
to be on so Windows runs happily.

Remember it's my machine.  If I wish to compromise security on my
machine because it makes sense, I should be allowed to.

A proper lockdown would prevent you from messing with ACPI tables; it is a
very creative hack to have the kernel load a DSDT and have it, from ring
zero, turn bits in the kernel off.

The reality here is we need to be able to operate without lockdown due
to how badly broken some hardware is to configure the system.

Yes the need to include option to push button to disable secure boot
is required due to how badly broken this stuff is.   Of course this
does not address the issue that if I am working on a system from
remote or embedded where I don't  have the push button to turn off as
a option this is still a problem.


Effective lockdown has to protect linux kernel boot parameters,
initramfs and other bits from being modified as well.   This lead us
to problem with the broken hardware in a machine we cannot turn secure
boot off we still need to perform all these alterations.


We do not live in a world of perfect computer hardware so at this
stage proper unattackable secureboot cannot be done.


We would be better off putting effort into improve means with UEFI of
adding own KEK.   This is so that only boot loaders and kernels from
the vendors user has approved in fact to work.  There could also be a
configuration KEK that gets disabled after all the required operating
systems are installed.So Microsoft non OS KEK makes sense to be
the configuration rule breaking KEK but the current deployments of
UEFI don't have a off switch option on it.


One KEK for everyone who is not Microsoft to boot with is highly insecure.


UEFI secureboot falls way short in the validation department currently
because too much is validated under one KEK key.

UEFI also falls short due to failing to provide a system to protect
boot parameters that can alter OS behaviour and make a secure kernel
insecure; this includes kernels with these lockdown patches.


Really you need to compare UEFI secureboot vs boot loader and /boot on
read-only media.   Everywhere you can change something in the UEFI
secureboot without it being signed, that you cannot in the read-only
media of the boot loader and /boot, is a defect in the UEFI secureboot
design and implementation.

If boot parameters were properly secured there would be no need for
lockdown query if UEFI was in secureboot mode or not.

Also lockdown being on and kernel and boot loader not running secured
still would provide extra item attacker has to get past.

So fairly much remove the EFI interrogation patches and work with UEFI
to fix it properly.   Hacking around these UEFI defects means we will
end up being stuck with them and the system still not being properly
secured.


Peter Dolding


Re: x86/dma conversion for v4.17-rc1 breaks sound / sst-acpi (commit 6e4bf5867783)

2018-04-05 Thread Dominik Brodowski
On Fri, Apr 06, 2018 at 02:14:18AM +0100, Mark Brown wrote:
> On Thu, Apr 05, 2018 at 10:56:57PM +0200, Dominik Brodowski wrote:
> > Christoph,
> > 
> > unfortunately, commit 6e4bf5867783 breaks sound on my Dell XPS13, see the
> > dmesg diff between fec777c385b6 and 6e4bf5867783:
> 
> Adding Vinod and Pierre from Intel in case they have any ideas here.
> Which model of XPS13 is this (2015?)?

Dell Inc. XPS 13 9343/0TM99H, BIOS A11 12/08/2016, with

# CONFIG_GPIOLIB is not set,

just CONFIG_PINCTRL=y but no

# CONFIG_PINCTRL_BAYTRAIL is not set
# CONFIG_PINCTRL_CHERRYVIEW is not set
# CONFIG_PINCTRL_BROXTON is not set
# CONFIG_PINCTRL_CANNONLAKE is not set
# CONFIG_PINCTRL_CEDARFORK is not set
# CONFIG_PINCTRL_DENVERTON is not set
# CONFIG_PINCTRL_GEMINILAKE is not set
# CONFIG_PINCTRL_LEWISBURG is not set
# CONFIG_PINCTRL_SUNRISEPOINT is not set

(yes, this works, see commit d1fa74520dcd ), and,

CONFIG_SND_HDA_I915=y
CONFIG_SND_SOC=y
CONFIG_SND_SOC_AC97_BUS=y
CONFIG_SND_SOC_COMPRESS=y
CONFIG_SND_SOC_ACPI=y
CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y
CONFIG_SND_SST_IPC=y
CONFIG_SND_SST_IPC_ACPI=y
CONFIG_SND_SOC_INTEL_SST_ACPI=y
CONFIG_SND_SOC_INTEL_SST=y
CONFIG_SND_SOC_INTEL_SST_FIRMWARE=y
CONFIG_SND_SOC_INTEL_HASWELL=y
CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=y
CONFIG_SND_SOC_ACPI_INTEL_MATCH=y
CONFIG_SND_SOC_INTEL_MACH=y
CONFIG_SND_SOC_INTEL_BROADWELL_MACH=y
CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=y
CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=y
CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=y
CONFIG_SND_SOC_I2C_AND_SPI=y
CONFIG_SND_SOC_AC97_CODEC=y


Thanks for looking into this!

Dominik


> > -sst-acpi INT3438:00: DesignWare DMA Controller, 8 channels
> > -haswell-pcm-audio haswell-pcm-audio: Direct firmware load for 
> > intel/IntcPP01.bin failed with error -2
> > -haswell-pcm-audio haswell-pcm-audio: fw image intel/IntcPP01.bin not 
> > available(-2)
> > -haswell-pcm-audio haswell-pcm-audio: FW loaded, mailbox readback FW info: 
> > type 01, - version: 00.00, build 77, source commit id: 
> > 876ac6906f31a43b6772b23c7c983ce9dcb18a19
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> System Pin mapping 
> > ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Offload0 Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Offload1 Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Loopback Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: rt286-aif1 <-> snd-soc-dummy-dai mapping 
> > ok
> > -input: broadwell-rt286 Headset as 
> > /devices/pci:00/INT3438:00/broadwell-audio/sound/card1/input15
> > +broadwell-audio broadwell-audio: ASoC: CPU DAI System Pin not registered
> 
> > So it seems that sst-acpi is unhappy with this patch. Any ideas?


Re: x86/dma conversion for v4.17-rc1 breaks sound / sst-acpi (commit 6e4bf5867783)

2018-04-05 Thread Dominik Brodowski
On Fri, Apr 06, 2018 at 02:14:18AM +0100, Mark Brown wrote:
> On Thu, Apr 05, 2018 at 10:56:57PM +0200, Dominik Brodowski wrote:
> > Christoph,
> > 
> > unfortunately, commit 6e4bf5867783 breaks sound on my Dell XPS13, see the
> > dmesg diff between fec777c385b6 and 6e4bf5867783:
> 
> Adding Vinod and Pierre from Intel in case they have any ideas here.
> Which model of XPS13 is this (2015?)?

Dell Inc. XPS 13 9343/0TM99H, BIOS A11 12/08/2016, with

# CONFIG_GPIOLIB is not set,

just CONFIG_PINCTRL=y but no

# CONFIG_PINCTRL_BAYTRAIL is not set
# CONFIG_PINCTRL_CHERRYVIEW is not set
# CONFIG_PINCTRL_BROXTON is not set
# CONFIG_PINCTRL_CANNONLAKE is not set
# CONFIG_PINCTRL_CEDARFORK is not set
# CONFIG_PINCTRL_DENVERTON is not set
# CONFIG_PINCTRL_GEMINILAKE is not set
# CONFIG_PINCTRL_LEWISBURG is not set
# CONFIG_PINCTRL_SUNRISEPOINT is not set

(yes, this works, see commit d1fa74520dcd ), and,

CONFIG_SND_HDA_I915=y
CONFIG_SND_SOC=y
CONFIG_SND_SOC_AC97_BUS=y
CONFIG_SND_SOC_COMPRESS=y
CONFIG_SND_SOC_ACPI=y
CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y
CONFIG_SND_SST_IPC=y
CONFIG_SND_SST_IPC_ACPI=y
CONFIG_SND_SOC_INTEL_SST_ACPI=y
CONFIG_SND_SOC_INTEL_SST=y
CONFIG_SND_SOC_INTEL_SST_FIRMWARE=y
CONFIG_SND_SOC_INTEL_HASWELL=y
CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=y
CONFIG_SND_SOC_ACPI_INTEL_MATCH=y
CONFIG_SND_SOC_INTEL_MACH=y
CONFIG_SND_SOC_INTEL_BROADWELL_MACH=y
CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=y
CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=y
CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=y
CONFIG_SND_SOC_I2C_AND_SPI=y
CONFIG_SND_SOC_AC97_CODEC=y


Thanks for looking into this!

Dominik


> > -sst-acpi INT3438:00: DesignWare DMA Controller, 8 channels
> > -haswell-pcm-audio haswell-pcm-audio: Direct firmware load for 
> > intel/IntcPP01.bin failed with error -2
> > -haswell-pcm-audio haswell-pcm-audio: fw image intel/IntcPP01.bin not 
> > available(-2)
> > -haswell-pcm-audio haswell-pcm-audio: FW loaded, mailbox readback FW info: 
> > type 01, - version: 00.00, build 77, source commit id: 
> > 876ac6906f31a43b6772b23c7c983ce9dcb18a19
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> System Pin mapping 
> > ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Offload0 Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Offload1 Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: snd-soc-dummy-dai <-> Loopback Pin 
> > mapping ok
> > -broadwell-audio broadwell-audio: rt286-aif1 <-> snd-soc-dummy-dai mapping 
> > ok
> > -input: broadwell-rt286 Headset as 
> > /devices/pci:00/INT3438:00/broadwell-audio/sound/card1/input15
> > +broadwell-audio broadwell-audio: ASoC: CPU DAI System Pin not registered
> 
> > So it seems that sst-acpi is unhappy with this patch. Any ideas?


Re: [GIT PULL 3/3] ARM: SoC driver updates for 4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 2:23 PM, Arnd Bergmann  wrote:
>
> - the ARM CCN driver is moved out of drivers/bus into drivers/perf,
>   which makes more sense. Similarly, the performance monitoring
>   portion of the CCI driver are moved the same way and cleaned up
>   a little more.

This caused a trivial merge with the perf tree.

But since I don't *build* the trivial merge resolution due to it being
an arm-only file, I wanted to point it out.

Because "not tested" very possibly means "I screwed something silly up
and didn't notice".

So as trivial as it seemed, it should still be checked.

Linus


Re: [GIT PULL 3/3] ARM: SoC driver updates for 4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 2:23 PM, Arnd Bergmann  wrote:
>
> - the ARM CCN driver is moved out of drivers/bus into drivers/perf,
>   which makes more sense. Similarly, the performance monitoring
>   portion of the CCI driver are moved the same way and cleaned up
>   a little more.

This caused a trivial merge with the perf tree.

But since I don't *build* the trivial merge resolution due to it being
an arm-only file, I wanted to point it out.

Because "not tested" very possibly means "I screwed something silly up
and didn't notice".

So as trivial as it seemed, it should still be checked.

Linus


Re: [PATCH 1/2] af_key: Use DIV_ROUND_UP() instead of open-coded equivalent

2018-04-05 Thread Steffen Klassert
On Wed, Mar 28, 2018 at 09:35:26PM -0400, Kevin Easton wrote:
> On Wed, Mar 28, 2018 at 07:59:25AM +0200, Steffen Klassert wrote:
> > On Mon, Mar 26, 2018 at 07:39:16AM -0400, Kevin Easton wrote:
> > > Several places use (x + 7) / 8 to convert from a number of bits to a 
> > > number
> > > of bytes.  Replace those with DIV_ROUND_UP(x, 8) instead, for consistency
> > > with other parts of the same file.
> > > 
> > > Signed-off-by: Kevin Easton 
> > 
> > Is this a fix or just a cleanup?
> > If it is just a cleanup, please resent it based on ipsec-next.
> 
> Hi Steffen,
> 
> This patch (1/2) is just a cleanup, but it's in preparation for patch 2/2
> which is a fix and modifies some of the same lines.

So please resend the fix without the cleanup, otherwise we can
get conflicts when backporting the fix to the stable trees.


Re: [PATCH 1/2] af_key: Use DIV_ROUND_UP() instead of open-coded equivalent

2018-04-05 Thread Steffen Klassert
On Wed, Mar 28, 2018 at 09:35:26PM -0400, Kevin Easton wrote:
> On Wed, Mar 28, 2018 at 07:59:25AM +0200, Steffen Klassert wrote:
> > On Mon, Mar 26, 2018 at 07:39:16AM -0400, Kevin Easton wrote:
> > > Several places use (x + 7) / 8 to convert from a number of bits to a 
> > > number
> > > of bytes.  Replace those with DIV_ROUND_UP(x, 8) instead, for consistency
> > > with other parts of the same file.
> > > 
> > > Signed-off-by: Kevin Easton 
> > 
> > Is this a fix or just a cleanup?
> > If it is just a cleanup, please resent it based on ipsec-next.
> 
> Hi Steffen,
> 
> This patch (1/2) is just a cleanup, but it's in preparation for patch 2/2
> which is a fix and modifies some of the same lines.

So please resend the fix without the cleanup, otherwise we can
get conflicts when backporting the fix to the stable trees.


Re: [GIT PULL 2/3] ARM: SoC platform updates for 4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 2:29 PM, Arnd Bergmann  wrote:
>
> One clear sign that the branch is indeed bigger than usual: the
> shortlog+diffstat exceeds the 100KB limit for the linux-arm-kernel
> mailing list.

I think you replied to the wrong pull request, and meant to reply to
the DT one..

The platform updates one was not that big.

Linus


Re: [GIT PULL 2/3] ARM: SoC platform updates for 4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 2:29 PM, Arnd Bergmann  wrote:
>
> One clear sign that the branch is indeed bigger than usual: the
> shortlog+diffstat exceeds the 100KB limit for the linux-arm-kernel
> mailing list.

I think you replied to the wrong pull request, and meant to reply to
the DT one..

The platform updates one was not that big.

Linus


Re: [PATCH 5/6] rhashtable: support guaranteed successful insertion.

2018-04-05 Thread Herbert Xu
On Fri, Apr 06, 2018 at 01:11:56PM +1000, NeilBrown wrote:
>
> You don't need to handle memory allocation failures at the point where
> you insert into the table - adding to a linked list requires no new
> memory.

You do actually.  The r in rhashtable stands for resizable.  We
cannot completely rely on a background hash table resize because
the insertions can be triggered from softirq context and be without
rate-limits of any kind (e.g., incoming TCP connections).

Therefore at some point you either have to wait for the resize or
fail the insertion.  Since we cannot sleep then failing is the only
option.

> The likelihood of the error isn't really the issue - it either can
> happen or it cannot.  If it can, it requires code to handle it.

Sure, but you just have to handle it the same way you would handle
a memory allocation failure, because that's what it essentially is.

I'm sorry if that means you have to restructure your code to do that
but that's what you pay for having a resizable hash table because
at some point you just have to perform a resize.

> Ahhh... I see what you are thinking now.  You aren't suggesting a
> sharded count that is summed periodically.  Rather you are suggesting that
> we divide the hash space into N regions each with their own independent
> counter, and resize the table if any one region reaches 70% occupancy -
> on the assumption that the other regions are likely to be close.  Is
> that right?

Yes.

> It would probably be dangerous to allow automatic shrinking (when just
> one drops below 30%) in that case, but it might be OK for growing.

At the greatest granularity it would be a counter per bucket, so
clearly we need to maintain some kind of bound on the granularity
so our sample space is not too small.

Automatic shrinking shouldn't be an issue because that's the slow
kind of resizing that we punt to kthread context and it can afford
to do a careful count.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 5/6] rhashtable: support guaranteed successful insertion.

2018-04-05 Thread Herbert Xu
On Fri, Apr 06, 2018 at 01:11:56PM +1000, NeilBrown wrote:
>
> You don't need to handle memory allocation failures at the point where
> you insert into the table - adding to a linked list requires no new
> memory.

You do actually.  The r in rhashtable stands for resizable.  We
cannot completely rely on a background hash table resize because
the insertions can be triggered from softirq context and be without
rate-limits of any kind (e.g., incoming TCP connections).

Therefore at some point you either have to wait for the resize or
fail the insertion.  Since we cannot sleep then failing is the only
option.

> The likelihood of the error isn't really the issue - it either can
> happen or it cannot.  If it can, it requires code to handle it.

Sure, but you just have to handle it the same way you would handle
a memory allocation failure, because that's what it essentially is.

I'm sorry if that means you have to restructure your code to do that
but that's what you pay for having a resizable hash table because
at some point you just have to perform a resize.

> Ahhh... I see what you are thinking now.  You aren't suggesting a
> sharded count that is summed periodically.  Rather you are suggesting that
> we divide the hash space into N regions each with their own independent
> counter, and resize the table if any one region reaches 70% occupancy -
> on the assumption that the other regions are likely to be close.  Is
> that right?

Yes.

> It would probably be dangerous to allow automatic shrinking (when just
> one drops below 30%) in that case, but it might be OK for growing.

At the greatest granularity it would be a counter per bucket, so
clearly we need to maintain some kind of bound on the granularity
so our sample space is not too small.

Automatic shrinking shouldn't be an issue because that's the slow
kind of resizing that we punt to kthread context and it can afford
to do a careful count.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v4 0/1] Safe LSM (un)loading, and immutable hooks

2018-04-05 Thread Peter Dolding
On Fri, Apr 6, 2018 at 11:31 AM, Sargun Dhillon  wrote:
>
>
> On Thu, Apr 5, 2018 at 9:29 AM, Casey Schaufler 
> wrote:
>>
>> On 4/5/2018 3:31 AM, Peter Dolding wrote:
>> > On Thu, Apr 5, 2018 at 7:55 PM, Igor Stoppa 
>> > wrote:
>> >> On 01/04/18 08:41, Sargun Dhillon wrote:
>> >>> The biggest security benefit of this patchset is the introduction of
>> >>> read-only hooks, even if some security modules have mutable hooks.
>> >>> Currently, if you have any LSMs with mutable hooks it will render all
>> >>> heads, and
>> >>> list nodes mutable. These are a prime place to attack, because being
>> >>> able to
>> >>> manipulate those hooks is a way to bypass all LSMs easily, and to
>> >>> create a
>> >>> persistent, covert channel to intercept nearly all calls.
>> >>>
>> >>>
>> >>> If LSMs have a model to be unloaded, or are compled as modules, they
>> >>> should mark
>> >>> themselves mutable at compile time, and use the LSM_HOOK_INIT_MUTABLE
>> >>> macro
>> >>> instead of the LSM_HOOK_INIT macro, so their hooks are on the mutable
>> >>> chain.
>> >>
>> >> I'd rather consider these types of hooks:
>> >>
>> >> A) hooks that are either const or marked as RO after init
>> >>
>> >> B) hooks that are writable for a short time, long enough to load
>> >> additional, non built-in modules, but then get locked down
>> >> I provided an example some time ago [1]
>> >>
>> >> C) hooks that are unloadable (and therefore always attackable?)
>> >>
>> >> Maybe type-A could be dropped and used only as type-B, if it's
>> >> acceptable that type-A hooks are vulnerable before lock-down of type-B
>> >> hooks.
>> >>
>> >> I have some doubts about the usefulness of type-C, though.
>> >> The benefit I see htat it brings is that it avoids having to reboot
>> >> when
>> >> a mutable LSM is changed, at the price of leaving it attackable.
>> >>
>> >> Do you have any specific case in mind where this trade-off would be
>> >> acceptable?
>> >>
>> > A useful case for loadable/unloadable LSM is development automate QA.
>> >
>> > So you have built a new program and you you want to test it against a
>> > list of different LSM configurations without having to reboot the
>> > system.  So a run testsuite with LSM off  then enabled LSM1 run
>> > testsuite again disable LSM1 enable LSM2. run testsuite disable
>> > LSM2... Basically repeating process.
>> >
>> > I would say normal production machines being able to swap LSM like
>> > this does not have much use.
>> >
>> > Sometimes for productivity it makes sense to be able to breach
>> > security.   The fact you need to test with LSM disabled to know if any
>> > of the defects you are seeing is LSM configuration related that
>> > instance is already in the camp of non secure anyhow..
>> >
>> > There is a shade of grey between something being a security hazard and
>> > something being a useful feature.
>>
>> If the only value of a feature is development I strongly
>> advocate against it. The number of times I've seen things
>> completely messed up because it makes development easier
>> is astonishing. If you have to enable something dangerous
>> just for testing you have to wonder about the testing.
>>
Casey Schaufler, we have had different points of view before.   I will
point out some serious issues here.  If you look at PPA and many other
locations you will find no LSM configuration files.

Majority of QA servers around the place run with LSM off.   There is a
practical annoying reason.   No point running application with new
code with LSM on at first you run with LSM off to make sure program
works.   If program works and you have the resources then transfer to
another machine/reboot to test with LSM this creates a broken
workflow.   When customer gets untested LSM configuration files and
they don't work what do support straight up recommend turning the LSM
off.

In reality, enabling LSM module loading and unloading on the fly on QA
servers will not change their security one bit because most are
running without LSM at all.   Making it simple to implement LSM
configuration testing on QA servers will reduce the number of times
end users are told to turn LSM off on their machines, which will affect
overall security.

So we need to make the process of testing LSM configurations against
applications on the QA servers way smoother.

> So, first, this gives us a security benefit for LSMs which do not have
> unloadable hooks. For those, they will always be able to load at boot-time,
> and get protected hooks. Given that we can't really remove
> security_delete_hooks until this SELinux removes their dependency on it, I'm
> not sure we that this happy accident of safe (un)loading should be
> sacrificed.
>
> I think having LSMs that are loadable after boot is extremely valuable. In
> our specific use case, we've wanted to implement specific security policies
> which are not capable of being implemented on the traditional LSMs. We have
> the capability 

Re: [PATCH v4 0/1] Safe LSM (un)loading, and immutable hooks

2018-04-05 Thread Peter Dolding
On Fri, Apr 6, 2018 at 11:31 AM, Sargun Dhillon  wrote:
>
>
> On Thu, Apr 5, 2018 at 9:29 AM, Casey Schaufler 
> wrote:
>>
>> On 4/5/2018 3:31 AM, Peter Dolding wrote:
>> > On Thu, Apr 5, 2018 at 7:55 PM, Igor Stoppa 
>> > wrote:
>> >> On 01/04/18 08:41, Sargun Dhillon wrote:
>> >>> The biggest security benefit of this patchset is the introduction of
>> >>> read-only hooks, even if some security modules have mutable hooks.
>> >>> Currently, if you have any LSMs with mutable hooks it will render all
>> >>> heads, and
>> >>> list nodes mutable. These are a prime place to attack, because being
>> >>> able to
>> >>> manipulate those hooks is a way to bypass all LSMs easily, and to
>> >>> create a
>> >>> persistent, covert channel to intercept nearly all calls.
>> >>>
>> >>>
>> >>> If LSMs have a model to be unloaded, or are compled as modules, they
>> >>> should mark
>> >>> themselves mutable at compile time, and use the LSM_HOOK_INIT_MUTABLE
>> >>> macro
>> >>> instead of the LSM_HOOK_INIT macro, so their hooks are on the mutable
>> >>> chain.
>> >>
>> >> I'd rather consider these types of hooks:
>> >>
>> >> A) hooks that are either const or marked as RO after init
>> >>
>> >> B) hooks that are writable for a short time, long enough to load
>> >> additional, non built-in modules, but then get locked down
>> >> I provided an example some time ago [1]
>> >>
>> >> C) hooks that are unloadable (and therefore always attackable?)
>> >>
>> >> Maybe type-A could be dropped and used only as type-B, if it's
>> >> acceptable that type-A hooks are vulnerable before lock-down of type-B
>> >> hooks.
>> >>
>> >> I have some doubts about the usefulness of type-C, though.
>> >> The benefit I see that it brings is that it avoids having to reboot
>> >> when
>> >> a mutable LSM is changed, at the price of leaving it attackable.
>> >>
>> >> Do you have any specific case in mind where this trade-off would be
>> >> acceptable?
>> >>
>> > A useful case for loadable/unloadable LSM is development automate QA.
>> >
>> > So you have built a new program and you you want to test it against a
>> > list of different LSM configurations without having to reboot the
>> > system.  So a run testsuite with LSM off  then enabled LSM1 run
>> > testsuite again disable LSM1 enable LSM2. run testsuite disable
>> > LSM2... Basically repeating process.
>> >
>> > I would say normal production machines being able to swap LSM like
>> > this does not have much use.
>> >
>> > Sometimes for productivity it makes sense to be able to breach
>> > security.   The fact you need to test with LSM disabled to know if any
>> > of the defects you are seeing is LSM configuration related that
>> > instance is already in the camp of non secure anyhow..
>> >
>> > There is a shade of grey between something being a security hazard and
>> > something being a useful feature.
>>
>> If the only value of a feature is development I strongly
>> advocate against it. The number of times I've seen things
>> completely messed up because it makes development easier
>> is astonishing. If you have to enable something dangerous
>> just for testing you have to wonder about the testing.
>>
Casey Schaufler, we have had different points of view before.   I will
point out some serious issues here.  If you look at PPA and many other
locations you will find no LSM configuration files.

Majority of QA servers around the place run with LSM off.   There is a
practical annoying reason.   No point running application with new
code with LSM on at first you run with LSM off to make sure program
works.   If program works and you have the resources then transfer to
another machine/reboot to test with LSM this creates a broken
workflow.   When customer gets untested LSM configuration files and
they don't work what do support straight up recommend turning the LSM
off.

In reality, enabling LSM module loading and unloading on the fly on QA
servers will not change their security one bit because most are
running without LSM at all.   Making it simple to implement LSM
configuration testing on QA servers will reduce the number of times
end users are told to turn LSM off on their machines, which will affect
overall security.

So we need to make the process of testing LSM configurations against
applications on the QA servers way smoother.

> So, first, this gives us a security benefit for LSMs which do not have
> unloadable hooks. For those, they will always be able to load at boot-time,
> and get protected hooks. Given that we can't really remove
> security_delete_hooks until this SELinux removes their dependency on it, I'm
> not sure we that this happy accident of safe (un)loading should be
> sacrificed.
>
> I think having LSMs that are loadable after boot is extremely valuable. In
> our specific use case, we've wanted to implement specific security policies
> which are not capable of being implemented on the traditional LSMs. We have
> the capability of deploying a Linux Kernel Module throughout our fleet.
> Recent 

Re: [PATCH net-next] netns: filter uevents correctly

2018-04-05 Thread Eric W. Biederman
Christian Brauner  writes:

> On Thu, Apr 05, 2018 at 05:26:59PM +0300, Kirill Tkhai wrote:
>> On 05.04.2018 17:07, Christian Brauner wrote:
>> > On Thu, Apr 05, 2018 at 04:01:03PM +0300, Kirill Tkhai wrote:
>> >> On 04.04.2018 22:48, Christian Brauner wrote:
>> >>> commit 07e98962fa77 ("kobject: Send hotplug events in all network 
>> >>> namespaces")
>> >>>
>> >>> enabled sending hotplug events into all network namespaces back in 2010.
>> >>> Over time the set of uevents that get sent into all network namespaces 
>> >>> has
>> >>> shrunk. We have now reached the point where hotplug events for all 
>> >>> devices
>> >>> that carry a namespace tag are filtered according to that namespace.
>> >>>
>> >>> Specifically, they are filtered whenever the namespace tag of the kobject
>> >>> does not match the namespace tag of the netlink socket. One example are
>> >>> network devices. Uevents for network devices only show up in the network
>> >>> namespaces these devices are moved to or created in.
>> >>>
>> >>> However, any uevent for a kobject that does not have a namespace tag
>> >>> associated with it will not be filtered and we will *try* to broadcast it
>> >>> into all network namespaces.
>> >>>
>> >>> The original patchset was written in 2010 before user namespaces were a
>> >>> thing. With the introduction of user namespaces sending out uevents 
>> >>> became
>> >>> partially isolated as they were filtered by user namespaces:
>> >>>
>> >>> net/netlink/af_netlink.c:do_one_broadcast()
>> >>>
>> >>> if (!net_eq(sock_net(sk), p->net)) {
>> >>> if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
>> >>> return;
>> >>>
>> >>> if (!peernet_has_id(sock_net(sk), p->net))
>> >>> return;
>> >>>
>> >>> if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
>> >>>  CAP_NET_BROADCAST))
>> >>> j   return;
>> >>> }
>> >>>
>> >>> The file_ns_capable() check will check whether the caller had
>> >>> CAP_NET_BROADCAST at the time of opening the netlink socket in the user
>> >>> namespace of interest. This check is fine in general but seems 
>> >>> insufficient
>> >>> to me when paired with uevents. The reason is that devices always belong 
>> >>> to
>> >>> the initial user namespace so uevents for kobjects that do not carry a
>> >>> namespace tag should never be sent into another user namespace. This has
>> >>> been the intention all along. But there's one case where this breaks,
>> >>> namely if a new user namespace is created by root on the host and an
>> >>> identity mapping is established between root on the host and root in the
>> >>> new user namespace. Here's a reproducer:
>> >>>
>> >>>  sudo unshare -U --map-root
>> >>>  udevadm monitor -k
>> >>>  # Now change to initial user namespace and e.g. do
>> >>>  modprobe kvm
>> >>>  # or
>> >>>  rmmod kvm
>> >>>
>> >>> will allow the non-initial user namespace to retrieve all uevents from 
>> >>> the
>> >>> host. This seems very anecdotal given that in the general case user
>> >>> namespaces do not see any uevents and also can't really do anything 
>> >>> useful
>> >>> with them.
>> >>>
>> >>> Additionally, it is now possible to send uevents from userspace. As such 
>> >>> we
>> >>> can let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
>> >>> namespace of the network namespace of the netlink socket) userspace 
>> >>> process
>> >>> make a decision what uevents should be sent.
>> >>>
>> >>> This makes me think that we should simply ensure that uevents for 
>> >>> kobjects
>> >>> that do not carry a namespace tag are *always* filtered by user namespace
>> >>> in kobj_bcast_filter(). Specifically:
>> >>> - If the owning user namespace of the uevent socket is not init_user_ns 
>> >>> the
>> >>>   event will always be filtered.
>> >>> - If the network namespace the uevent socket belongs to was created in 
>> >>> the
>> >>>   initial user namespace but was opened from a non-initial user namespace
>> >>>   the event will be filtered as well.
>> >>> Put another way, uevents for kobjects not carrying a namespace tag are 
>> >>> now
>> >>> always only sent to the initial user namespace. The regression potential
>> >>> for this is near to non-existent since user namespaces can't really do
>> >>> anything with interesting devices.
>> >>>
>> >>> Signed-off-by: Christian Brauner 
>> >>> ---
>> >>>  lib/kobject_uevent.c | 10 +-
>> >>>  1 file changed, 9 insertions(+), 1 deletion(-)
>> >>>
>> >>> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
>> >>> index 15ea216a67ce..cb98cddb6e3b 100644
>> >>> --- a/lib/kobject_uevent.c
>> >>> +++ b/lib/kobject_uevent.c
>> >>> @@ -251,7 +251,15 @@ static int kobj_bcast_filter(struct sock *dsk, 
>> >>> struct sk_buff *skb, void *data)
>> >>>  return sock_ns != ns;
>> >>>  }
>> >>>  
>> >>> -return 0;
>> >>> +/*
>> >>> + * 

Re: [PATCH net-next] netns: filter uevents correctly

2018-04-05 Thread Eric W. Biederman
Christian Brauner  writes:

> On Thu, Apr 05, 2018 at 05:26:59PM +0300, Kirill Tkhai wrote:
>> On 05.04.2018 17:07, Christian Brauner wrote:
>> > On Thu, Apr 05, 2018 at 04:01:03PM +0300, Kirill Tkhai wrote:
>> >> On 04.04.2018 22:48, Christian Brauner wrote:
>> >>> commit 07e98962fa77 ("kobject: Send hotplug events in all network 
>> >>> namespaces")
>> >>>
>> >>> enabled sending hotplug events into all network namespaces back in 2010.
>> >>> Over time the set of uevents that get sent into all network namespaces 
>> >>> has
>> >>> shrunk. We have now reached the point where hotplug events for all 
>> >>> devices
>> >>> that carry a namespace tag are filtered according to that namespace.
>> >>>
>> >>> Specifically, they are filtered whenever the namespace tag of the kobject
>> >>> does not match the namespace tag of the netlink socket. One example are
>> >>> network devices. Uevents for network devices only show up in the network
>> >>> namespaces these devices are moved to or created in.
>> >>>
>> >>> However, any uevent for a kobject that does not have a namespace tag
>> >>> associated with it will not be filtered and we will *try* to broadcast it
>> >>> into all network namespaces.
>> >>>
>> >>> The original patchset was written in 2010 before user namespaces were a
>> >>> thing. With the introduction of user namespaces sending out uevents 
>> >>> became
>> >>> partially isolated as they were filtered by user namespaces:
>> >>>
>> >>> net/netlink/af_netlink.c:do_one_broadcast()
>> >>>
>> >>> if (!net_eq(sock_net(sk), p->net)) {
>> >>> if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
>> >>> return;
>> >>>
>> >>> if (!peernet_has_id(sock_net(sk), p->net))
>> >>> return;
>> >>>
>> >>> if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
>> >>>  CAP_NET_BROADCAST))
>> >>> return;
>> >>> }
>> >>>
>> >>> The file_ns_capable() check will check whether the caller had
>> >>> CAP_NET_BROADCAST at the time of opening the netlink socket in the user
>> >>> namespace of interest. This check is fine in general but seems 
>> >>> insufficient
>> >>> to me when paired with uevents. The reason is that devices always belong 
>> >>> to
>> >>> the initial user namespace so uevents for kobjects that do not carry a
>> >>> namespace tag should never be sent into another user namespace. This has
>> >>> been the intention all along. But there's one case where this breaks,
>> >>> namely if a new user namespace is created by root on the host and an
>> >>> identity mapping is established between root on the host and root in the
>> >>> new user namespace. Here's a reproducer:
>> >>>
>> >>>  sudo unshare -U --map-root
>> >>>  udevadm monitor -k
>> >>>  # Now change to initial user namespace and e.g. do
>> >>>  modprobe kvm
>> >>>  # or
>> >>>  rmmod kvm
>> >>>
>> >>> will allow the non-initial user namespace to retrieve all uevents from 
>> >>> the
>> >>> host. This seems very anecdotal given that in the general case user
>> >>> namespaces do not see any uevents and also can't really do anything 
>> >>> useful
>> >>> with them.
>> >>>
>> >>> Additionally, it is now possible to send uevents from userspace. As such 
>> >>> we
>> >>> can let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
>> >>> namespace of the network namespace of the netlink socket) userspace 
>> >>> process
>> >>> make a decision what uevents should be sent.
>> >>>
>> >>> This makes me think that we should simply ensure that uevents for 
>> >>> kobjects
>> >>> that do not carry a namespace tag are *always* filtered by user namespace
>> >>> in kobj_bcast_filter(). Specifically:
>> >>> - If the owning user namespace of the uevent socket is not init_user_ns 
>> >>> the
>> >>>   event will always be filtered.
>> >>> - If the network namespace the uevent socket belongs to was created in 
>> >>> the
>> >>>   initial user namespace but was opened from a non-initial user namespace
>> >>>   the event will be filtered as well.
>> >>> Put another way, uevents for kobjects not carrying a namespace tag are 
>> >>> now
>> >>> always only sent to the initial user namespace. The regression potential
>> >>> for this is near to non-existent since user namespaces can't really do
>> >>> anything with interesting devices.
>> >>>
>> >>> Signed-off-by: Christian Brauner 
>> >>> ---
>> >>>  lib/kobject_uevent.c | 10 +-
>> >>>  1 file changed, 9 insertions(+), 1 deletion(-)
>> >>>
>> >>> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
>> >>> index 15ea216a67ce..cb98cddb6e3b 100644
>> >>> --- a/lib/kobject_uevent.c
>> >>> +++ b/lib/kobject_uevent.c
>> >>> @@ -251,7 +251,15 @@ static int kobj_bcast_filter(struct sock *dsk, 
>> >>> struct sk_buff *skb, void *data)
>> >>>  return sock_ns != ns;
>> >>>  }
>> >>>  
>> >>> -return 0;
>> >>> +/*
>> >>> + * The kobject does not carry a namespace tag so filter by user
>> 

Re: [PATCH 2/1] Kbuild: fix # escaping in .cmd files for future Make

2018-04-05 Thread Masahiro Yamada
2018-04-06 6:43 GMT+09:00 Rasmus Villemoes :
> On 2018-03-26 13:48, Masahiro Yamada wrote:
>> 2018-03-26 8:09 GMT+09:00 Rasmus Villemoes :
>>> The latest official Make release is 4.2.1 from mid-2016, but the current
>>> git release has this relevant note in the NEWS file:
>>>
>>>   * WARNING: Backward-incompatibility!
>>> Number signs (#) appearing inside a macro reference or function 
>>> invocation
>>> no longer introduce comments and should not be escaped with backslashes:
>>> thus a call such as:
>>>   foo := $(shell echo '#')
>>> is legal.  Previously the number sign needed to be escaped, for example:
>>>   foo := $(shell echo '\#')
>>> Now this latter will resolve to "\#".  If you want to write makefiles
>>> portable to both versions, assign the number sign to a variable:
>>>   C := \#
>>>   foo := $(shell echo '$C')
>>> This was claimed to be fixed in 3.81, but wasn't, for some reason.
>>> To detect this change search for 'nocomment' in the .FEATURES variable.
>>>
>>> Prepare for whatever future Make release contains that change by fixing
>>> up the .cmd file escaping - without this, make always thinks the command
>>> string has changed and hence rebuilds everything.
>>>
>>> Signed-off-by: Rasmus Villemoes 
>>> ---
>>> So the previous patch made everything build, but building again
>>> revealed that this central place very much also needed fixing.
>>>
>>>  scripts/Kbuild.include | 3 ++-
>>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
>>> index 065324a8046f..7a926e4688f4 100644
>>> --- a/scripts/Kbuild.include
>>> +++ b/scripts/Kbuild.include
>>> @@ -10,6 +10,7 @@ space   := $(empty) $(empty)
>>>  space_escape := _-_SPACE_-_
>>>  right_paren := )
>>>  left_paren := (
>>> +pound := \#
>>>
>>>  ###
>>>  # Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
>>> @@ -328,7 +329,7 @@ endif
>>>  # (needed for make)
>>>  # Replace >'< with >'\''< to be able to enclose the whole string in '...'
>>>  # (needed for the shell)
>>> -make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,,$(cmd_$(1)
>>> +make-cmd = $(call escsq,$(subst $(pound),\\$(pound),$(subst 
>>> $$,,$(cmd_$(1)
>>>
>>
>> Thanks for the patch, but this changes the behavior.
>> With '#' replaced with $(pound), '\\' does not escape anything,
>> so it is treated as '\\'.
>>
>>
>> The following keeps the current behavior:
>>
>> make-cmd = $(call escsq,$(subst $(pound),\$(pound),$(subst
>> $$,,$(cmd_$(1)
>>
>>
>>
>> But, I think the following is an even better fix:
>>
>> make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst
>> $$,,$(cmd_$(1)
>
> Makefile quoting has never been my strong suit. I actually thought I
> tested my change by looking at some .o.cmd files containing \# before
> and after, but I was fooled by tools/ containing and using their own
> copy of make-cmd - so tools/build/Build.include will also need fixing.
>
> Yes, writing $(pound) to the .cmd file seems like the safest choice. Do
> you want me to respin or can/will you do the patch(es) yourself?
>

Can you send v2 please?

I will pick it up shortly.



-- 
Best Regards
Masahiro Yamada


Re: [PATCH 2/1] Kbuild: fix # escaping in .cmd files for future Make

2018-04-05 Thread Masahiro Yamada
2018-04-06 6:43 GMT+09:00 Rasmus Villemoes :
> On 2018-03-26 13:48, Masahiro Yamada wrote:
>> 2018-03-26 8:09 GMT+09:00 Rasmus Villemoes :
>>> The latest official Make release is 4.2.1 from mid-2016, but the current
>>> git release has this relevant note in the NEWS file:
>>>
>>>   * WARNING: Backward-incompatibility!
>>> Number signs (#) appearing inside a macro reference or function 
>>> invocation
>>> no longer introduce comments and should not be escaped with backslashes:
>>> thus a call such as:
>>>   foo := $(shell echo '#')
>>> is legal.  Previously the number sign needed to be escaped, for example:
>>>   foo := $(shell echo '\#')
>>> Now this latter will resolve to "\#".  If you want to write makefiles
>>> portable to both versions, assign the number sign to a variable:
>>>   C := \#
>>>   foo := $(shell echo '$C')
>>> This was claimed to be fixed in 3.81, but wasn't, for some reason.
>>> To detect this change search for 'nocomment' in the .FEATURES variable.
>>>
>>> Prepare for whatever future Make release contains that change by fixing
>>> up the .cmd file escaping - without this, make always thinks the command
>>> string has changed and hence rebuilds everything.
>>>
>>> Signed-off-by: Rasmus Villemoes 
>>> ---
>>> So the previous patch made everything build, but building again
>>> revealed that this central place very much also needed fixing.
>>>
>>>  scripts/Kbuild.include | 3 ++-
>>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
>>> index 065324a8046f..7a926e4688f4 100644
>>> --- a/scripts/Kbuild.include
>>> +++ b/scripts/Kbuild.include
>>> @@ -10,6 +10,7 @@ space   := $(empty) $(empty)
>>>  space_escape := _-_SPACE_-_
>>>  right_paren := )
>>>  left_paren := (
>>> +pound := \#
>>>
>>>  ###
>>>  # Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
>>> @@ -328,7 +329,7 @@ endif
>>>  # (needed for make)
>>>  # Replace >'< with >'\''< to be able to enclose the whole string in '...'
>>>  # (needed for the shell)
>>> -make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,,$(cmd_$(1)
>>> +make-cmd = $(call escsq,$(subst $(pound),\\$(pound),$(subst 
>>> $$,,$(cmd_$(1)
>>>
>>
>> Thanks for the patch, but this changes the behavior.
>> With '#' replaced with $(pound), '\\' does not escape anything,
>> so it is treated as '\\'.
>>
>>
>> The following keeps the current behavior:
>>
>> make-cmd = $(call escsq,$(subst $(pound),\$(pound),$(subst
>> $$,,$(cmd_$(1)
>>
>>
>>
>> But, I think the following is an even better fix:
>>
>> make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst
>> $$,,$(cmd_$(1)
>
> Makefile quoting has never been my strong suit. I actually thought I
> tested my change by looking at some .o.cmd files containing \# before
> and after, but I was fooled by tools/ containing and using their own
> copy of make-cmd - so tools/build/Build.include will also need fixing.
>
> Yes, writing $(pound) to the .cmd file seems like the safest choice. Do
> you want me to respin or can/will you do the patch(es) yourself?
>

Can you send v2 please?

I will pick it up shortly.



-- 
Best Regards
Masahiro Yamada


make xmldocs failed with error after 4.17 merge period

2018-04-05 Thread Masanari Iida
After merge following patch during 4.17 merger period,
make xmldocs start to fail with error.

 [bdecb33af34f79cbfbb656661210f77c8b8b5b5f]
usb: typec: API for controlling USB Type-C Multiplexers

Error messages.
reST markup error:
/home/iida/Repo/linux-2.6/Documentation/driver-api/usb/typec.rst:215:
(SEVERE/4) Unexpected section title or transition.


Documentation/Makefile:93: recipe for target 'xmldocs' failed
make[1]: *** [xmldocs] Error 1
Makefile:1527: recipe for target 'xmldocs' failed
make: *** [xmldocs] Error 2

$

An ASCII graphic in typec.rst causes the error.

Masanari Iida


make xmldocs failed with error after 4.17 merge period

2018-04-05 Thread Masanari Iida
After merge following patch during 4.17 merger period,
make xmldocs start to fail with error.

 [bdecb33af34f79cbfbb656661210f77c8b8b5b5f]
usb: typec: API for controlling USB Type-C Multiplexers

Error messages.
reST markup error:
/home/iida/Repo/linux-2.6/Documentation/driver-api/usb/typec.rst:215:
(SEVERE/4) Unexpected section title or transition.


Documentation/Makefile:93: recipe for target 'xmldocs' failed
make[1]: *** [xmldocs] Error 1
Makefile:1527: recipe for target 'xmldocs' failed
make: *** [xmldocs] Error 2

$

An ASCII graphic in typec.rst causes the error.

Masanari Iida


Re: [PATCH 3/6] aio: refactor read/write iocb setup

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:36AM +0200, Christoph Hellwig wrote:
> + struct inode *inode = file_inode(file);
> +
>   req->ki_flags |= IOCB_WRITE;
>   file_start_write(file);
> - ret = aio_ret(req, call_write_iter(file, req, ));
> + ret = aio_rw_ret(req, call_write_iter(file, req, ));
>   /*
> -  * We release freeze protection in aio_complete().  Fool lockdep
> -  * by telling it the lock got released so that it doesn't
> -  * complain about held lock when we return to userspace.
> +  * We release freeze protection in aio_complete_rw().  Fool
> +  * lockdep by telling it the lock got released so that it
> +  * doesn't complain about held lock when we return to userspace.
>*/
> - if (S_ISREG(file_inode(file)->i_mode))
> - __sb_writers_release(file_inode(file)->i_sb, 
> SB_FREEZE_WRITE);
> + if (S_ISREG(inode->i_mode))

... and that's another use-after-free, since we might've already done fput() of
that sucker by that point.

> + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);


Re: [PATCH 3/6] aio: refactor read/write iocb setup

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:36AM +0200, Christoph Hellwig wrote:
> + struct inode *inode = file_inode(file);
> +
>   req->ki_flags |= IOCB_WRITE;
>   file_start_write(file);
> - ret = aio_ret(req, call_write_iter(file, req, ));
> + ret = aio_rw_ret(req, call_write_iter(file, req, ));
>   /*
> -  * We release freeze protection in aio_complete().  Fool lockdep
> -  * by telling it the lock got released so that it doesn't
> -  * complain about held lock when we return to userspace.
> +  * We release freeze protection in aio_complete_rw().  Fool
> +  * lockdep by telling it the lock got released so that it
> +  * doesn't complain about held lock when we return to userspace.
>*/
> - if (S_ISREG(file_inode(file)->i_mode))
> - __sb_writers_release(file_inode(file)->i_sb, 
> SB_FREEZE_WRITE);
> + if (S_ISREG(inode->i_mode))

... and that's another use-after-free, since we might've already done fput() of
that sucker by that point.

> + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);


Re: io_pgetevents & aio fsync V2

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:33AM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this patch adds workqueue based fsync offload.  Version of this
> patch have been floating around for a couple years, but we now
> have a user with seastar used by ScyllaDB (who sponsored this
> work) that really wants this in addition to the aio poll support.
> More details are in the patch itself.
> 
> Because the iocb types have been defined since day one (and probably
> were supported by RHEL3) libaio already supports these calls as-is.
> 
> This also pulls in the aio cleanups and io_pgetevents support previously
> submitted and reviewed as part of the aio poll series.  The aio poll
> series will be resubmitted on top of this series

BTW, this is only tangentially related, but... does *anything* call
io_submit() for huge amounts of iocb?  Check in do_io_submit() is
insane - "no more than MAX_LONG total of _pointers_".  Compat variant
goes for "no more than a page worth of pointers" and there's
a hard limit in ioctx_alloc() - we can't ever get more than
8M slots in ring buffer...


Re: io_pgetevents & aio fsync V2

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:33AM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this patch adds workqueue based fsync offload.  Version of this
> patch have been floating around for a couple years, but we now
> have a user with seastar used by ScyllaDB (who sponsored this
> work) that really wants this in addition to the aio poll support.
> More details are in the patch itself.
> 
> Because the iocb types have been defined since day one (and probably
> were supported by RHEL3) libaio already supports these calls as-is.
> 
> This also pulls in the aio cleanups and io_pgetevents support previously
> submitted and reviewed as part of the aio poll series.  The aio poll
> series will be resubmitted on top of this series

BTW, this is only tangentially related, but... does *anything* call
io_submit() for huge amounts of iocb?  Check in do_io_submit() is
insane - "no more than MAX_LONG total of _pointers_".  Compat variant
goes for "no more than a page worth of pointers" and there's
a hard limit in ioctx_alloc() - we can't ever get more than
8M slots in ring buffer...


[PATCH] mm: shmem: enable thp migration (Re: [PATCH v1] mm: consider non-anonymous thp as unmovable page)

2018-04-05 Thread Naoya Horiguchi
Hi everyone,

On Thu, Apr 05, 2018 at 06:03:17PM +0200, Michal Hocko wrote:
> On Thu 05-04-18 18:55:51, Kirill A. Shutemov wrote:
> > On Thu, Apr 05, 2018 at 05:05:47PM +0200, Michal Hocko wrote:
> > > On Thu 05-04-18 16:40:45, Kirill A. Shutemov wrote:
> > > > On Thu, Apr 05, 2018 at 02:48:30PM +0200, Michal Hocko wrote:
> > > [...]
> > > > > Right, I confused the two. What is the proper layer to fix that then?
> > > > > rmap_walk_file?
> > > > 
> > > > Maybe something like this? Totally untested.
> > > 
> > > This looks way too complex. Why cannot we simply split THP page cache
> > > during migration?
> > 
> > This way we unify the codepath for archictures that don't support THP
> > migration and shmem THP.
> 
> But why? There shouldn't be really nothing to prevent THP (anon or
> shemem) to be migratable. If we cannot migrate it at once we can always
> split it. So why should we add another thp specific handling all over
> the place?

If thp migration works fine for shmem, we can keep anon/shmem thp to
be migratable and we don't need any ad-hoc workaround.
So I wrote a patch to enable it.
This patch does not change any shmem specific code, so I think that
it works for file thp (not only shmem,) but I don't test it yet.

Thanks,
Naoya Horiguchi
-
>From e31ec037701d1cc76b26226e4b66d8c783d40889 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi 
Date: Fri, 6 Apr 2018 10:58:35 +0900
Subject: [PATCH] mm: enable thp migration for shmem thp

My testing for the latest kernel supporting thp migration showed an
infinite loop in offlining the memory block that is filled with shmem
thps.  We can get out of the loop with a signal, but kernel should
return with failure in this case.

What happens in the loop is that scan_movable_pages() repeats returning
the same pfn without any progress. That's because page migration always
fails for shmem thps.

In memory offline code, memory blocks containing unmovable pages should
be prevented from being offline targets by has_unmovable_pages() inside
start_isolate_page_range(). So it's possible to change migratability
for non-anonymous thps to avoid the issue, but it introduces more complex
and thp-specific handling in migration code, so it might not be good.

So this patch is suggesting to fix the issue by enabling thp migration
for shmem thp. Both of anon/shmem thp are migratable so we don't need
precheck about the type of thps.

Fixes: commit 72b39cfc4d75 ("mm, memory_hotplug: do not fail offlining too 
early")
Signed-off-by: Naoya Horiguchi 
Cc: sta...@vger.kernel.org # v4.15+
---
 mm/huge_memory.c |  5 -
 mm/migrate.c | 19 ---
 mm/rmap.c|  3 ---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2aff58624886..933c1bbd3464 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2926,7 +2926,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk 
*pvmw, struct page *new)
pmde = maybe_pmd_mkwrite(pmde, vma);
 
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
-   page_add_anon_rmap(new, vma, mmun_start, true);
+   if (PageAnon(new))
+   page_add_anon_rmap(new, vma, mmun_start, true);
+   else
+   page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(new);
diff --git a/mm/migrate.c b/mm/migrate.c
index bdef905b1737..f92dd9f50981 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(>i_pages,
page_index(page));
 
-   expected_count += 1 + page_has_private(page);
+   expected_count += hpage_nr_pages(page) + page_has_private(page);
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot,
>i_pages.xa_lock) != page) {
@@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 */
newpage->index = page->index;
newpage->mapping = page->mapping;
-   get_page(newpage);  /* add cache reference */
+   page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space 
*mapping,
}
 
radix_tree_replace_slot(>i_pages, pslot, newpage);
+   if (PageTransHuge(page)) {
+   int i;
+   int index = page_index(page);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++) {
+   pslot = radix_tree_lookup_slot(>i_pages,
+  index + i);
+   

[PATCH] mm: shmem: enable thp migration (Re: [PATCH v1] mm: consider non-anonymous thp as unmovable page)

2018-04-05 Thread Naoya Horiguchi
Hi everyone,

On Thu, Apr 05, 2018 at 06:03:17PM +0200, Michal Hocko wrote:
> On Thu 05-04-18 18:55:51, Kirill A. Shutemov wrote:
> > On Thu, Apr 05, 2018 at 05:05:47PM +0200, Michal Hocko wrote:
> > > On Thu 05-04-18 16:40:45, Kirill A. Shutemov wrote:
> > > > On Thu, Apr 05, 2018 at 02:48:30PM +0200, Michal Hocko wrote:
> > > [...]
> > > > > Right, I confused the two. What is the proper layer to fix that then?
> > > > > rmap_walk_file?
> > > > 
> > > > Maybe something like this? Totally untested.
> > > 
> > > This looks way too complex. Why cannot we simply split THP page cache
> > > during migration?
> > 
> > This way we unify the codepath for archictures that don't support THP
> > migration and shmem THP.
> 
> But why? There shouldn't be really nothing to prevent THP (anon or
> shemem) to be migratable. If we cannot migrate it at once we can always
> split it. So why should we add another thp specific handling all over
> the place?

If thp migration works fine for shmem, we can keep anon/shmem thp to
be migratable and we don't need any ad-hoc workaround.
So I wrote a patch to enable it.
This patch does not change any shmem specific code, so I think that
it works for file thp (not only shmem,) but I don't test it yet.

Thanks,
Naoya Horiguchi
-
>From e31ec037701d1cc76b26226e4b66d8c783d40889 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi 
Date: Fri, 6 Apr 2018 10:58:35 +0900
Subject: [PATCH] mm: enable thp migration for shmem thp

My testing for the latest kernel supporting thp migration showed an
infinite loop in offlining the memory block that is filled with shmem
thps.  We can get out of the loop with a signal, but kernel should
return with failure in this case.

What happens in the loop is that scan_movable_pages() repeats returning
the same pfn without any progress. That's because page migration always
fails for shmem thps.

In memory offline code, memory blocks containing unmovable pages should
be prevented from being offline targets by has_unmovable_pages() inside
start_isolate_page_range(). So it's possible to change migratability
for non-anonymous thps to avoid the issue, but it introduces more complex
and thp-specific handling in migration code, so it might not be good.

So this patch is suggesting to fix the issue by enabling thp migration
for shmem thp. Both of anon/shmem thp are migratable so we don't need
precheck about the type of thps.

Fixes: commit 72b39cfc4d75 ("mm, memory_hotplug: do not fail offlining too 
early")
Signed-off-by: Naoya Horiguchi 
Cc: sta...@vger.kernel.org # v4.15+
---
 mm/huge_memory.c |  5 -
 mm/migrate.c | 19 ---
 mm/rmap.c|  3 ---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2aff58624886..933c1bbd3464 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2926,7 +2926,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk 
*pvmw, struct page *new)
pmde = maybe_pmd_mkwrite(pmde, vma);
 
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
-   page_add_anon_rmap(new, vma, mmun_start, true);
+   if (PageAnon(new))
+   page_add_anon_rmap(new, vma, mmun_start, true);
+   else
+   page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(new);
diff --git a/mm/migrate.c b/mm/migrate.c
index bdef905b1737..f92dd9f50981 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(>i_pages,
page_index(page));
 
-   expected_count += 1 + page_has_private(page);
+   expected_count += hpage_nr_pages(page) + page_has_private(page);
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot,
>i_pages.xa_lock) != page) {
@@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 */
newpage->index = page->index;
newpage->mapping = page->mapping;
-   get_page(newpage);  /* add cache reference */
+   page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space 
*mapping,
}
 
radix_tree_replace_slot(>i_pages, pslot, newpage);
+   if (PageTransHuge(page)) {
+   int i;
+   int index = page_index(page);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++) {
+   pslot = radix_tree_lookup_slot(>i_pages,
+  index + i);
+   radix_tree_replace_slot(>i_pages, pslot,
+  

Re: KASAN: use-after-free Write in irq_bypass_register_consumer

2018-04-05 Thread Eric Biggers
On Mon, Jan 29, 2018 at 01:29:48PM +0800, Tianyu Lan wrote:
> 
> 
> On 1/27/2018 7:27 AM, Eric Biggers wrote:
> > On Sat, Dec 16, 2017 at 04:37:02PM +0800, Lan, Tianyu wrote:
> > > The root cause is that kvm_irqfd_assign() and kvm_irqfd_deassign() can't
> > > be run in parallel. Some data structure(e.g, irqfd->consumer) will be
> > > crashed because irqfd may be freed in deassign path before they are used
> > > in assign path. The other data maybe used in deassign path before
> > > initialization. Syzbot test hit such case. Add mutx between
> > > kvm_irqfd_assign() and kvm_irqfd_deassign() can fix such issue. Will
> > > send patch to fix it.
> > > 
> > > On 12/16/2017 12:53 PM, Tianyu Lan wrote:
> > > > I reproduced the issue. Will have a look.
> > > > 
> > > > -- Best regards Tianyu Lan 2017-12-15 18:14 GMT+08:00 syzbot
> > > > :
> > > > > syzkaller has found reproducer for the following crash on
> > > > > 82bcf1def3b5f1251177ad47c44f7e17af039b4b
> > > > > git://git.cmpxchg.org/linux-mmots.git/master
> > > > > compiler: gcc (GCC) 7.1.1 20170620
> > > > > .config is attached
> > > > > Raw console output is attached.
> > > > > C reproducer is attached
> > > > > syzkaller reproducer is attached. Seehttps://goo.gl/kgGztJ
> > > > > for information about syzkaller reproducers
> > > > > 
> > > > > 
> > > > > ==
> > > > > BUG: KASAN: use-after-free in __list_add include/linux/list.h:64 
> > > > > [inline]
> > > > > BUG: KASAN: use-after-free in list_add include/linux/list.h:79 
> > > > > [inline]
> > > > > BUG: KASAN: use-after-free in irq_bypass_register_consumer+0x4b4/0x500
> > > > > virt/lib/irqbypass.c:217
> > > > > Write of size 8 at addr 8801cdf51180 by task syzkaller436086/15031
> > > > > 
> > > > > CPU: 1 PID: 15031 Comm: syzkaller436086 Not tainted 4.15.0-rc2-mm1+ 
> > > > > #39
> > > > > Hardware name: Google Google Compute Engine/Google Compute Engine, 
> > > > > BIOS
> > > > > Google 01/01/2011
> > > > > Call Trace:
> > > > >__dump_stack lib/dump_stack.c:17 [inline]
> > > > >dump_stack+0x194/0x257 lib/dump_stack.c:53
> > > > >print_address_description+0x73/0x250 mm/kasan/report.c:252
> > > > >kasan_report_error mm/kasan/report.c:351 [inline]
> > > > >kasan_report+0x25b/0x340 mm/kasan/report.c:409
> > > > >__asan_report_store8_noabort+0x17/0x20 mm/kasan/report.c:435
> > > > >__list_add include/linux/list.h:64 [inline]
> > > > >list_add include/linux/list.h:79 [inline]
> > > > >irq_bypass_register_consumer+0x4b4/0x500 virt/lib/irqbypass.c:217
> > > > >kvm_irqfd_assign arch/x86/kvm/../../../virt/kvm/eventfd.c:417 
> > > > > [inline]
> > > > >kvm_irqfd+0x137f/0x1d50 
> > > > > arch/x86/kvm/../../../virt/kvm/eventfd.c:572
> > > > >kvm_vm_ioctl+0x1079/0x1c40 
> > > > > arch/x86/kvm/../../../virt/kvm/kvm_main.c:2992
> > > > >vfs_ioctl fs/ioctl.c:46 [inline]
> > > > >do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
> > > > >SYSC_ioctl fs/ioctl.c:701 [inline]
> > > > >SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
> > > > >entry_SYSCALL_64_fastpath+0x1f/0x96
> > > > > RIP: 0033:0x44d379
> > > > > RSP: 002b:7fc5ff9a9d08 EFLAGS: 0246 ORIG_RAX: 0010
> > > > > RAX: ffda RBX: 7fc5ff9aa700 RCX: 0044d379
> > > > > RDX: 20080fe0 RSI: 4020ae76 RDI: 0005
> > > > > RBP: 007ff900 R08: 7fc5ff9aa700 R09: 7fc5ff9aa700
> > > > > R10: 7fc5ff9aa700 R11: 0246 R12: 
> > > > > R13: 007ff8ff R14: 7fc5ff9aa9c0 R15: 
> > > > > 
> > > > > Allocated by task 15031:
> > > > >save_stack+0x43/0xd0 mm/kasan/kasan.c:447
> > > > >set_track mm/kasan/kasan.c:459 [inline]
> > > > >kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
> > > > >kmem_cache_alloc_trace+0x136/0x750 mm/slab.c:3614
> > > > >kmalloc include/linux/slab.h:516 [inline]
> > > > >kzalloc include/linux/slab.h:705 [inline]
> > > > >kvm_irqfd_assign arch/x86/kvm/../../../virt/kvm/eventfd.c:296 
> > > > > [inline]
> > > > >kvm_irqfd+0x16c/0x1d50 arch/x86/kvm/../../../virt/kvm/eventfd.c:572
> > > > >kvm_vm_ioctl+0x1079/0x1c40 
> > > > > arch/x86/kvm/../../../virt/kvm/kvm_main.c:2992
> > > > >vfs_ioctl fs/ioctl.c:46 [inline]
> > > > >do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
> > > > >SYSC_ioctl fs/ioctl.c:701 [inline]
> > > > >SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
> > > > >entry_SYSCALL_64_fastpath+0x1f/0x96
> > > > > 
> > > > > Freed by task 1402:
> > > > >save_stack+0x43/0xd0 mm/kasan/kasan.c:447
> > > > >set_track mm/kasan/kasan.c:459 [inline]
> > > > >kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
> > > > >__cache_free mm/slab.c:3492 [inline]
> > > > >kfree+0xca/0x250 mm/slab.c:3807
> > > > >irqfd_shutdown+0x13c/0x1a0 
> > > > > arch/x86/kvm/../../../virt/kvm/eventfd.c:148
> > > > 

Re: KASAN: use-after-free Write in irq_bypass_register_consumer

2018-04-05 Thread Eric Biggers
On Mon, Jan 29, 2018 at 01:29:48PM +0800, Tianyu Lan wrote:
> 
> 
> On 1/27/2018 7:27 AM, Eric Biggers wrote:
> > On Sat, Dec 16, 2017 at 04:37:02PM +0800, Lan, Tianyu wrote:
> > > The root cause is that kvm_irqfd_assign() and kvm_irqfd_deassign() can't
> > > be run in parallel. Some data structure(e.g, irqfd->consumer) will be
> > > crashed because irqfd may be freed in deassign path before they are used
> > > in assign path. The other data maybe used in deassign path before
> > > initialization. Syzbot test hit such case. Add mutx between
> > > kvm_irqfd_assign() and kvm_irqfd_deassign() can fix such issue. Will
> > > send patch to fix it.
> > > 
> > > On 12/16/2017 12:53 PM, Tianyu Lan wrote:
> > > > I reproduced the issue. Will have a look.
> > > > 
> > > > -- Best regards Tianyu Lan 2017-12-15 18:14 GMT+08:00 syzbot
> > > > :
> > > > > syzkaller has found reproducer for the following crash on
> > > > > 82bcf1def3b5f1251177ad47c44f7e17af039b4b
> > > > > git://git.cmpxchg.org/linux-mmots.git/master
> > > > > compiler: gcc (GCC) 7.1.1 20170620
> > > > > .config is attached
> > > > > Raw console output is attached.
> > > > > C reproducer is attached
> > > > > syzkaller reproducer is attached. Seehttps://goo.gl/kgGztJ
> > > > > for information about syzkaller reproducers
> > > > > 
> > > > > 
> > > > > ==
> > > > > BUG: KASAN: use-after-free in __list_add include/linux/list.h:64 
> > > > > [inline]
> > > > > BUG: KASAN: use-after-free in list_add include/linux/list.h:79 
> > > > > [inline]
> > > > > BUG: KASAN: use-after-free in irq_bypass_register_consumer+0x4b4/0x500
> > > > > virt/lib/irqbypass.c:217
> > > > > Write of size 8 at addr 8801cdf51180 by task syzkaller436086/15031
> > > > > 
> > > > > CPU: 1 PID: 15031 Comm: syzkaller436086 Not tainted 4.15.0-rc2-mm1+ 
> > > > > #39
> > > > > Hardware name: Google Google Compute Engine/Google Compute Engine, 
> > > > > BIOS
> > > > > Google 01/01/2011
> > > > > Call Trace:
> > > > >__dump_stack lib/dump_stack.c:17 [inline]
> > > > >dump_stack+0x194/0x257 lib/dump_stack.c:53
> > > > >print_address_description+0x73/0x250 mm/kasan/report.c:252
> > > > >kasan_report_error mm/kasan/report.c:351 [inline]
> > > > >kasan_report+0x25b/0x340 mm/kasan/report.c:409
> > > > >__asan_report_store8_noabort+0x17/0x20 mm/kasan/report.c:435
> > > > >__list_add include/linux/list.h:64 [inline]
> > > > >list_add include/linux/list.h:79 [inline]
> > > > >irq_bypass_register_consumer+0x4b4/0x500 virt/lib/irqbypass.c:217
> > > > >kvm_irqfd_assign arch/x86/kvm/../../../virt/kvm/eventfd.c:417 
> > > > > [inline]
> > > > >kvm_irqfd+0x137f/0x1d50 
> > > > > arch/x86/kvm/../../../virt/kvm/eventfd.c:572
> > > > >kvm_vm_ioctl+0x1079/0x1c40 
> > > > > arch/x86/kvm/../../../virt/kvm/kvm_main.c:2992
> > > > >vfs_ioctl fs/ioctl.c:46 [inline]
> > > > >do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
> > > > >SYSC_ioctl fs/ioctl.c:701 [inline]
> > > > >SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
> > > > >entry_SYSCALL_64_fastpath+0x1f/0x96
> > > > > RIP: 0033:0x44d379
> > > > > RSP: 002b:7fc5ff9a9d08 EFLAGS: 0246 ORIG_RAX: 0010
> > > > > RAX: ffda RBX: 7fc5ff9aa700 RCX: 0044d379
> > > > > RDX: 20080fe0 RSI: 4020ae76 RDI: 0005
> > > > > RBP: 007ff900 R08: 7fc5ff9aa700 R09: 7fc5ff9aa700
> > > > > R10: 7fc5ff9aa700 R11: 0246 R12: 
> > > > > R13: 007ff8ff R14: 7fc5ff9aa9c0 R15: 
> > > > > 
> > > > > Allocated by task 15031:
> > > > >save_stack+0x43/0xd0 mm/kasan/kasan.c:447
> > > > >set_track mm/kasan/kasan.c:459 [inline]
> > > > >kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
> > > > >kmem_cache_alloc_trace+0x136/0x750 mm/slab.c:3614
> > > > >kmalloc include/linux/slab.h:516 [inline]
> > > > >kzalloc include/linux/slab.h:705 [inline]
> > > > >kvm_irqfd_assign arch/x86/kvm/../../../virt/kvm/eventfd.c:296 
> > > > > [inline]
> > > > >kvm_irqfd+0x16c/0x1d50 arch/x86/kvm/../../../virt/kvm/eventfd.c:572
> > > > >kvm_vm_ioctl+0x1079/0x1c40 
> > > > > arch/x86/kvm/../../../virt/kvm/kvm_main.c:2992
> > > > >vfs_ioctl fs/ioctl.c:46 [inline]
> > > > >do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
> > > > >SYSC_ioctl fs/ioctl.c:701 [inline]
> > > > >SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
> > > > >entry_SYSCALL_64_fastpath+0x1f/0x96
> > > > > 
> > > > > Freed by task 1402:
> > > > >save_stack+0x43/0xd0 mm/kasan/kasan.c:447
> > > > >set_track mm/kasan/kasan.c:459 [inline]
> > > > >kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
> > > > >__cache_free mm/slab.c:3492 [inline]
> > > > >kfree+0xca/0x250 mm/slab.c:3807
> > > > >irqfd_shutdown+0x13c/0x1a0 
> > > > > arch/x86/kvm/../../../virt/kvm/eventfd.c:148
> > > > >process_one_work+0xbfd/0x1bc0 kernel/workqueue.c:2113
> > > > >

Re: [PATCH 5/6] rhashtable: support guaranteed successful insertion.

2018-04-05 Thread NeilBrown
On Thu, Mar 29 2018, Herbert Xu wrote:

> On Thu, Mar 29, 2018 at 08:26:21AM +1100, NeilBrown wrote:
>>
>> I say "astronomically unlikely", you say "probability .. is extremely
>> low".  I think we are in agreement here.
>> 
>> The point remains that if an error *can* be returned then I have to
>> write code to handle it and test that code.  I'd rather not.
>
> You have be able to handle errors anyway because of memory allocation
> failures.  Ultimately if you keep inserting you will eventually
> fail with ENOMEM.  So I don't see the issue with an additional
> error value.

You don't need to handle memory allocation failures at the point where
you insert into the table - adding to a linked list requires no new
memory.

If you are running out of memory, you will fail to allocate new objects,
so you won't be able to add them to the rhashtable.  Obviously you have
to handle that failure, and that is likely to save rhashtable from
getting many mem-alloc failures.

But once you have allocated a new object, rhashtable should just accept
it.  It doesn't help anyone to have the insertion fail.
The insertion can currently fail even when allocating a new object would
succeed, as insertion can require a GFP_ATOMIC allocation to succeed,
while allocating a new object might only need a GFP_KERNEL allocation to
succeed. 

>
>> > Even if it does happen we won't fail because we will perform
>> > an immediate rehash.  We only fail if it happens right away
>> > after the rehash (that is, at least another 16 elements have
>> > been inserted and you're trying to insert a 17th element, all
>> > while the new hash table has not been completely populated),
>> > which means that somebody has figured out our hash secret and
>> > failing in that case makes sense.
>
> BTW, you didn't acknowledge this bit which I think is crucial to
> how likely such an error is.

The likelihood of the error isn't really the issue - it either can
happen or it cannot.  If it can, it requires code to handle it.

>
>> I never suggested retrying, but I would have to handle it somehow.  I'd
>> rather not.
>
> ...
>
>> While I have no doubt that there are hashtables where someone could try
>> to attack the hash, I am quite sure there are others where is such an
>> attack is meaningless - any code which could generate the required range of
>> keys, could do far worse things more easily.
>
> Our network hashtable has to be secure against adversaries.  I
> understand that this may not be important to your use-case.  However,
> given the fact that the failure would only occur if an adversary
> is present and actively attacking your hash table, I don't think
> it has that much of a negative effect on your use-case either.

I certainly accept that there are circumstances where it is a real
possibility that an adversary might succeed in attacking the hash
function, and changing the seed for each table is an excellent idea.
It isn't clear to me that failing insertions is needed - it is done
rather late and so doesn't throttle much, and could give the attacker
feedback that they had succeeded (?).

Not all hash tables are susceptible to attack.  It would be nice if
developers working in less exposed areas could use rhashtable without ever
having to check for errors.

Error codes are messages from the implementation to the caller.
They should be chosen to help the caller make useful choices, not to
allow the implementation to avoid awkward situations.

>
> Of course if you can reproduce the EBUSY error without your
> disable_count patch or even after you have fixed the issue I
> have pointed out in your disable_count patch you can still reproduce
> it then that would suggest a real bug and we would need to fix it,
> for everyone.

I suspect that I cannot.  However experience tells me that customers do
things that I cannot and would never expect - they can often trigger
errors that I cannot.  It is best if the error cannot be returned.

>
>> Yes, storing a sharded count in the spinlock table does seem like an
>> appropriate granularity.  However that leads me to ask: why do we have
>> the spinlock table?  Why not bit spinlocks in the hashchain head like
>> include/linux/list_bl uses?
>
> The spinlock table predates rhashtable.  Perhaps Thomas/Eric/Dave
> can elucidate this.

Maybe I'll submit an RFC patch to change it - if I can make it work.

>
>> I don't understand how it can ever be "too late", though I appreciate
>> that in some cases "sooner" is better than "later"
>> If we give up on the single atomic_t counter, then we must accept that
>> the number of elements could exceed any given value.  The only promise
>> we can provide is that it wont exceed N% of the table size for more than
>> T seconds.
>
> Sure.  However, assuming we use an estimate that is hash-based,
> that *should* be fairly accurate assuming that your hash function
> is working in the first place.  It's completely different vs.
> estimating based on a per-cpu count which could be wildly 

Re: [PATCH 5/6] rhashtable: support guaranteed successful insertion.

2018-04-05 Thread NeilBrown
On Thu, Mar 29 2018, Herbert Xu wrote:

> On Thu, Mar 29, 2018 at 08:26:21AM +1100, NeilBrown wrote:
>>
>> I say "astronomically unlikely", you say "probability .. is extremely
>> low".  I think we are in agreement here.
>> 
>> The point remains that if an error *can* be returned then I have to
>> write code to handle it and test that code.  I'd rather not.
>
> You have be able to handle errors anyway because of memory allocation
> failures.  Ultimately if you keep inserting you will eventually
> fail with ENOMEM.  So I don't see the issue with an additional
> error value.

You don't need to handle memory allocation failures at the point where
you insert into the table - adding to a linked list requires no new
memory.

If you are running out of memory, you will fail to allocate new objects,
so you won't be able to add them to the rhashtable.  Obviously you have
to handle that failure, and that is likely to save rhashtable from
getting many mem-alloc failures.

But once you have allocated a new object, rhashtable should just accept
it.  It doesn't help anyone to have the insertion fail.
The insertion can currently fail even when allocating a new object would
succeed, as insertion can require a GFP_ATOMIC allocation to succeed,
while allocating a new object might only need a GFP_KERNEL allocation to
succeed. 

>
>> > Even if it does happen we won't fail because we will perform
>> > an immediate rehash.  We only fail if it happens right away
>> > after the rehash (that is, at least another 16 elements have
>> > been inserted and you're trying to insert a 17th element, all
>> > while the new hash table has not been completely populated),
>> > which means that somebody has figured out our hash secret and
>> > failing in that case makes sense.
>
> BTW, you didn't acknowledge this bit which I think is crucial to
> how likely such an error is.

The likelihood of the error isn't really the issue - it either can
happen or it cannot.  If it can, it requires code to handle it.

>
>> I never suggested retrying, but I would have to handle it somehow.  I'd
>> rather not.
>
> ...
>
>> While I have no doubt that there are hashtables where someone could try
>> to attack the hash, I am quite sure there are others where is such an
>> attack is meaningless - any code which could generate the required range of
>> keys, could do far worse things more easily.
>
> Our network hashtable has to be secure against adversaries.  I
> understand that this may not be important to your use-case.  However,
> given the fact that the failure would only occur if an adversary
> is present and actively attacking your hash table, I don't think
> it has that much of a negative effect on your use-case either.

I certainly accept that there are circumstances where it is a real
possibility that an adversary might succeed in attacking the hash
function, and changing the seed for each table is an excellent idea.
It isn't clear to me that failing insertions is needed - it is done
rather late and so doesn't throttle much, and could give the attacker
feedback that they had succeeded (?).

Not all hash tables are susceptible to attack.  It would be nice if
developers working in less exposed areas could use rhashtable without ever
having to check for errors.

Error codes are messages from the implementation to the caller.
They should be chosen to help the caller make useful choices, not to
allow the implementation to avoid awkward situations.

>
> Of course if you can reproduce the EBUSY error without your
> disable_count patch or even after you have fixed the issue I
> have pointed out in your disable_count patch you can still reproduce
> it then that would suggest a real bug and we would need to fix it,
> for everyone.

I suspect that I cannot.  However experience tells me that customers do
things that I cannot and would never expect - they can often trigger
errors that I cannot.  It is best if the error cannot be returned.

>
>> Yes, storing a sharded count in the spinlock table does seem like an
>> appropriate granularity.  However that leads me to ask: why do we have
>> the spinlock table?  Why not bit spinlocks in the hashchain head like
>> include/linux/list_bl uses?
>
> The spinlock table predates rhashtable.  Perhaps Thomas/Eric/Dave
> can elucidate this.

Maybe I'll submit an RFC patch to change it - if I can make it work.

>
>> I don't understand how it can ever be "too late", though I appreciate
>> that in some cases "sooner" is better than "later"
>> If we give up on the single atomic_t counter, then we must accept that
>> the number of elements could exceed any given value.  The only promise
>> we can provide is that it wont exceed N% of the table size for more than
>> T seconds.
>
> Sure.  However, assuming we use an estimate that is hash-based,
> that *should* be fairly accurate assuming that your hash function
> is working in the first place.  It's completely different vs.
> estimating based on a per-cpu count which could be wildly 

Re: KASAN: use-after-free Read in worker_thread (2)

2018-04-05 Thread Eric Biggers
On Sat, Nov 11, 2017 at 07:56:01AM -0800, syzbot wrote:
> syzkaller has found reproducer for the following crash on
> d9e0e63d9a6f88440eb201e1491fcf730272c706
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> 
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> BUG: KASAN: use-after-free in worker_thread+0x15bb/0x1990
> kernel/workqueue.c:2244
> Read of size 8 at addr 88002d0e3de0 by task kworker/u8:1/1209
> 
> CPU: 0 PID: 1209 Comm: kworker/u8:1 Not tainted 4.14.0-rc8-next-20171110+
> #12
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>  kasan_report_error mm/kasan/report.c:351 [inline]
>  kasan_report+0x25b/0x340 mm/kasan/report.c:409
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
>  worker_thread+0x15bb/0x1990 kernel/workqueue.c:2244
>  kthread+0x37a/0x440 kernel/kthread.c:238
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:437
> 
> Allocated by task 11866:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
>  kmem_cache_alloc+0x12e/0x760 mm/slab.c:3548
>  kmem_cache_zalloc include/linux/slab.h:693 [inline]
>  kcm_attach net/kcm/kcmsock.c:1394 [inline]
>  kcm_attach_ioctl net/kcm/kcmsock.c:1460 [inline]
>  kcm_ioctl+0x2d1/0x1610 net/kcm/kcmsock.c:1695
>  sock_do_ioctl+0x65/0xb0 net/socket.c:960
>  sock_ioctl+0x2c2/0x440 net/socket.c:1057
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
>  SYSC_ioctl fs/ioctl.c:701 [inline]
>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> 
> Freed by task 11867:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
>  __cache_free mm/slab.c:3492 [inline]
>  kmem_cache_free+0x77/0x280 mm/slab.c:3750
>  kcm_unattach+0xe50/0x1510 net/kcm/kcmsock.c:1563
>  kcm_unattach_ioctl net/kcm/kcmsock.c:1608 [inline]
>  kcm_ioctl+0xdf0/0x1610 net/kcm/kcmsock.c:1705
>  sock_do_ioctl+0x65/0xb0 net/socket.c:960
>  sock_ioctl+0x2c2/0x440 net/socket.c:1057
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
>  SYSC_ioctl fs/ioctl.c:701 [inline]
>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> 
> The buggy address belongs to the object at 88002d0e3d00
>  which belongs to the cache kcm_psock_cache of size 576
> The buggy address is located 224 bytes inside of
>  576-byte region [88002d0e3d00, 88002d0e3f40)
> The buggy address belongs to the page:
> page:eab43880 count:1 mapcount:0 mapping:88002d0e2180 index:0x0
> compound_mapcount: 0
> flags: 0x1008100(slab|head)
> raw: 01008100 88002d0e2180  0001000b
> raw: eab14920 eab27e20 88002b0089c0 
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  88002d0e3c80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  88002d0e3d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > 88002d0e3d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>^
>  88002d0e3e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  88002d0e3e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==

No longer occurring, the fix seems to have been commit 7e9964574ee97:

#syz fix: kcm: Only allow TCP sockets to be attached to a KCM mux

- Eric


Re: KASAN: use-after-free Read in worker_thread (2)

2018-04-05 Thread Eric Biggers
On Sat, Nov 11, 2017 at 07:56:01AM -0800, syzbot wrote:
> syzkaller has found reproducer for the following crash on
> d9e0e63d9a6f88440eb201e1491fcf730272c706
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> 
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> BUG: KASAN: use-after-free in worker_thread+0x15bb/0x1990
> kernel/workqueue.c:2244
> Read of size 8 at addr 88002d0e3de0 by task kworker/u8:1/1209
> 
> CPU: 0 PID: 1209 Comm: kworker/u8:1 Not tainted 4.14.0-rc8-next-20171110+
> #12
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>  kasan_report_error mm/kasan/report.c:351 [inline]
>  kasan_report+0x25b/0x340 mm/kasan/report.c:409
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
>  worker_thread+0x15bb/0x1990 kernel/workqueue.c:2244
>  kthread+0x37a/0x440 kernel/kthread.c:238
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:437
> 
> Allocated by task 11866:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
>  kmem_cache_alloc+0x12e/0x760 mm/slab.c:3548
>  kmem_cache_zalloc include/linux/slab.h:693 [inline]
>  kcm_attach net/kcm/kcmsock.c:1394 [inline]
>  kcm_attach_ioctl net/kcm/kcmsock.c:1460 [inline]
>  kcm_ioctl+0x2d1/0x1610 net/kcm/kcmsock.c:1695
>  sock_do_ioctl+0x65/0xb0 net/socket.c:960
>  sock_ioctl+0x2c2/0x440 net/socket.c:1057
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
>  SYSC_ioctl fs/ioctl.c:701 [inline]
>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> 
> Freed by task 11867:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
>  __cache_free mm/slab.c:3492 [inline]
>  kmem_cache_free+0x77/0x280 mm/slab.c:3750
>  kcm_unattach+0xe50/0x1510 net/kcm/kcmsock.c:1563
>  kcm_unattach_ioctl net/kcm/kcmsock.c:1608 [inline]
>  kcm_ioctl+0xdf0/0x1610 net/kcm/kcmsock.c:1705
>  sock_do_ioctl+0x65/0xb0 net/socket.c:960
>  sock_ioctl+0x2c2/0x440 net/socket.c:1057
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:686
>  SYSC_ioctl fs/ioctl.c:701 [inline]
>  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> 
> The buggy address belongs to the object at 88002d0e3d00
>  which belongs to the cache kcm_psock_cache of size 576
> The buggy address is located 224 bytes inside of
>  576-byte region [88002d0e3d00, 88002d0e3f40)
> The buggy address belongs to the page:
> page:eab43880 count:1 mapcount:0 mapping:88002d0e2180 index:0x0
> compound_mapcount: 0
> flags: 0x1008100(slab|head)
> raw: 01008100 88002d0e2180  0001000b
> raw: eab14920 eab27e20 88002b0089c0 
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  88002d0e3c80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  88002d0e3d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > 88002d0e3d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>^
>  88002d0e3e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  88002d0e3e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==

No longer occurring, the fix seems to have been commit 7e9964574ee97:

#syz fix: kcm: Only allow TCP sockets to be attached to a KCM mux

- Eric


[PATCH v3] cpufreq: cppc_cpufreq: Initialize shared cpu's perf capabilities

2018-04-05 Thread Shunyong Yang
When multiple cpus are related in one cpufreq policy, the first online
cpu will be chosen by default to handle cpufreq operations. Let's take
cpu0 and cpu1 as an example.

When cpu0 is offline, policy->cpu will be shifted to cpu1. Cpu1's perf
capabilities should be initialized. Otherwise, perf capabilities are 0s
and speed change can not take effect.

This patch copies perf capabilities of the first online cpu to other
shared cpus when policy shared type is CPUFREQ_SHARED_TYPE_ANY.

Cc: Joey Zheng 
Acked-by: Viresh Kumar 
Signed-off-by: Shunyong Yang 
---

Changes in v3:
  -Remove unnecessary wrap per Kumar's comments.

Changes in v2:
  -Add unlikely in cpu comparison per Kumar's comments.
  -Fix coding style per Kumar's comments.

Changes in v1:
  -Drop RFC tag,
 The original RFC link,
 https://patchwork.kernel.org/patch/10299055/.

 This patch solves the same issue as the RFC above.

 The patch name is changed because the code is too different from the RFC above.

  -Remove extra init() per Viresh Kumar's comments and only handle
   CPPC CPUFREQ_SHARED_TYPE_ANY case.

---
 drivers/cpufreq/cppc_cpufreq.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8300a9fcb80c..8b432d6e846d 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -167,9 +167,19 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
NSEC_PER_USEC;
policy->shared_type = cpu->shared_type;
 
-   if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+   if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+   int i;
+
cpumask_copy(policy->cpus, cpu->shared_cpu_map);
-   else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
+
+   for_each_cpu(i, policy->cpus) {
+   if (unlikely(i == policy->cpu))
+   continue;
+
+   memcpy(_cpu_data[i]->perf_caps, >perf_caps,
+  sizeof(cpu->perf_caps));
+   }
+   } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
/* Support only SW_ANY for now. */
pr_debug("Unsupported CPU co-ord type\n");
return -EFAULT;
-- 
1.8.3.1



[PATCH v3] cpufreq: cppc_cpufreq: Initialize shared cpu's perf capabilities

2018-04-05 Thread Shunyong Yang
When multiple cpus are related in one cpufreq policy, the first online
cpu will be chosen by default to handle cpufreq operations. Let's take
cpu0 and cpu1 as an example.

When cpu0 is offline, policy->cpu will be shifted to cpu1. Cpu1's perf
capabilities should be initialized. Otherwise, perf capabilities are 0s
and speed change can not take effect.

This patch copies perf capabilities of the first online cpu to other
shared cpus when policy shared type is CPUFREQ_SHARED_TYPE_ANY.

Cc: Joey Zheng 
Acked-by: Viresh Kumar 
Signed-off-by: Shunyong Yang 
---

Changes in v3:
  -Remove unnecessary wrap per Kumar's comments.

Changes in v2:
  -Add unlikely in cpu comparison per Kumar's comments.
  -Fix coding style per Kumar's comments.

Changes in v1:
  -Drop RFC tag,
 The original RFC link,
 https://patchwork.kernel.org/patch/10299055/.

 This patch solves the same issue as the RFC above.

 The patch name is changed because the code is too different from the RFC above.

  -Remove extra init() per Viresh Kumar's comments and only handle
   CPPC CPUFREQ_SHARED_TYPE_ANY case.

---
 drivers/cpufreq/cppc_cpufreq.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8300a9fcb80c..8b432d6e846d 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -167,9 +167,19 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
NSEC_PER_USEC;
policy->shared_type = cpu->shared_type;
 
-   if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+   if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+   int i;
+
cpumask_copy(policy->cpus, cpu->shared_cpu_map);
-   else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
+
+   for_each_cpu(i, policy->cpus) {
+   if (unlikely(i == policy->cpu))
+   continue;
+
+   memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps,
+  sizeof(cpu->perf_caps));
+   }
+   } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
/* Support only SW_ANY for now. */
pr_debug("Unsupported CPU co-ord type\n");
return -EFAULT;
-- 
1.8.3.1



Re: [PATCH 5/6] aio: implement IOCB_CMD_FSYNC and IOCB_CMD_FDSYNC

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:38AM +0200, Christoph Hellwig wrote:
> +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool 
> datasync)
> +{
> + int ret;
> +
> + if (iocb->aio_buf)
> + return -EINVAL;
> + if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
> + return -EINVAL;
> +
> + req->file = fget(iocb->aio_fildes);
> + if (unlikely(!req->file))
> + return -EBADF;
> +
> + ret = -EINVAL;
> + if (!req->file->f_op->fsync)
> + goto out_fput;
> +
> + req->datasync = datasync;
> + INIT_WORK(&req->work, aio_fsync_work);
> + schedule_work(>work);
> + return -EIOCBQUEUED;
> +out_fput:
> + if (unlikely(ret && ret != -EIOCBQUEUED))

Really?  The only way to get there is if we got NULL ->f_op->fsync.
So this "if (unlikely(...))" is actually if (true) and I'm not
sure we want that separated anyway - simply

if (unlikely(!req->file->f_op->fsync)) {
fput(req->file);
return -EINVAL;
}

> + fput(req->file);

> + return ret;
> +}



Re: [PATCH 5/6] aio: implement IOCB_CMD_FSYNC and IOCB_CMD_FDSYNC

2018-04-05 Thread Al Viro
On Wed, Mar 28, 2018 at 09:26:38AM +0200, Christoph Hellwig wrote:
> +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool 
> datasync)
> +{
> + int ret;
> +
> + if (iocb->aio_buf)
> + return -EINVAL;
> + if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
> + return -EINVAL;
> +
> + req->file = fget(iocb->aio_fildes);
> + if (unlikely(!req->file))
> + return -EBADF;
> +
> + ret = -EINVAL;
> + if (!req->file->f_op->fsync)
> + goto out_fput;
> +
> + req->datasync = datasync;
> + INIT_WORK(&req->work, aio_fsync_work);
> + schedule_work(>work);
> + return -EIOCBQUEUED;
> +out_fput:
> + if (unlikely(ret && ret != -EIOCBQUEUED))

Really?  The only way to get there is if we got NULL ->f_op->fsync.
So this "if (unlikely(...))" is actually if (true) and I'm not
sure we want that separated anyway - simply

if (unlikely(!req->file->f_op->fsync)) {
fput(req->file);
return -EINVAL;
}

> + fput(req->file);

> + return ret;
> +}



Re: [GIT PULL] printk for 4.17

2018-04-05 Thread Linus Torvalds
On Wed, Apr 4, 2018 at 5:39 AM, Petr Mladek  wrote:
>
> Petr Mladek (1):
>   Merge branch 'for-4.17' into for-linus

Please don't do this.

There was no reason for it, and the commit message says:

  "Merge branch 'for-4.17' into for-linus"

which also doesn't tell why that merge would have existed.

You should just have asked me to pull "for-4.17". Or renamed the
branch to "for-linus". There was no reason for the merge that I can
see.

So what I actually did was to just pull your 'for-4.17' branch
instead, which got me the exact same end result.

   Linus


Re: [GIT PULL] printk for 4.17

2018-04-05 Thread Linus Torvalds
On Wed, Apr 4, 2018 at 5:39 AM, Petr Mladek  wrote:
>
> Petr Mladek (1):
>   Merge branch 'for-4.17' into for-linus

Please don't do this.

There was no reason for it, and the commit message says:

  "Merge branch 'for-4.17' into for-linus"

which also doesn't tell why that merge would have existed.

You should just have asked me to pull "for-4.17". Or renamed the
branch to "for-linus". There was no reason for the merge that I can
see.

So what I actually did was to just pull your 'for-4.17' branch
instead, which got me the exact same end result.

   Linus


Re: [RFC] virtio: Use DMA MAP API for devices without an IOMMU

2018-04-05 Thread Anshuman Khandual
On 04/06/2018 02:48 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-04-05 at 21:34 +0300, Michael S. Tsirkin wrote:
>>> In this specific case, because that would make qemu expect an iommu,
>>> and there isn't one.
>>
>>
>> I think that you can set iommu_platform in qemu without an iommu.
> 
> No I mean the platform has one but it's not desirable for it to be used
> due to the performance hit.

Also the only requirement is to bounce the I/O buffers through SWIOTLB
implemented as DMA API which the virtio core understands. There is no
need for an IOMMU to be involved for the device representation in this
case IMHO.



Re: [RFC] virtio: Use DMA MAP API for devices without an IOMMU

2018-04-05 Thread Anshuman Khandual
On 04/06/2018 02:48 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-04-05 at 21:34 +0300, Michael S. Tsirkin wrote:
>>> In this specific case, because that would make qemu expect an iommu,
>>> and there isn't one.
>>
>>
>> I think that you can set iommu_platform in qemu without an iommu.
> 
> No I mean the platform has one but it's not desirable for it to be used
> due to the performance hit.

Also the only requirement is to bounce the I/O buffers through SWIOTLB
implemented as DMA API which the virtio core understands. There is no
need for an IOMMU to be involved for the device representation in this
case IMHO.



Re: [PATCH] alpha: io: reorder barriers to guarantee writeX() and iowriteX() ordering

2018-04-05 Thread Matt Turner
On Thu, Apr 5, 2018 at 6:35 PM, Sinan Kaya  wrote:
> On 4/2/2018 1:48 PM, Sinan Kaya wrote:
>> memory-barriers.txt has been updated with the following requirement.
>>
>> "When using writel(), a prior wmb() is not needed to guarantee that the
>> cache coherent memory writes have completed before writing to the MMIO
>> region."
>>
>> Current writeX() and iowriteX() implementations on alpha are not
>> satisfying this requirement as the barrier is after the register write.
>>
>> Move mb() in writeX() and iowriteX() functions to guarantee that HW
>> observes memory changes before performing register operations.
>>
>> Signed-off-by: Sinan Kaya 
>> Reported-by: Arnd Bergmann 
>> ---
>>  arch/alpha/include/asm/io.h | 14 +++---
>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
>> index d123ff9..4c533fc 100644
>> --- a/arch/alpha/include/asm/io.h
>> +++ b/arch/alpha/include/asm/io.h
>> @@ -341,14 +341,14 @@ extern inline unsigned int ioread16(void __iomem *addr)
>>
>>  extern inline void iowrite8(u8 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite8)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite8)(b, addr);
>>  }
>>
>>  extern inline void iowrite16(u16 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite16)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite16)(b, addr);
>>  }
>>
>>  extern inline u8 inb(unsigned long port)
>> @@ -382,8 +382,8 @@ extern inline unsigned int ioread32(void __iomem *addr)
>>
>>  extern inline void iowrite32(u32 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite32)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite32)(b, addr);
>>  }
>>
>>  extern inline u32 inl(unsigned long port)
>> @@ -434,14 +434,14 @@ extern inline u16 readw(const volatile void __iomem 
>> *addr)
>>
>>  extern inline void writeb(u8 b, volatile void __iomem *addr)
>>  {
>> - __raw_writeb(b, addr);
>>   mb();
>> + __raw_writeb(b, addr);
>>  }
>>
>>  extern inline void writew(u16 b, volatile void __iomem *addr)
>>  {
>> - __raw_writew(b, addr);
>>   mb();
>> + __raw_writew(b, addr);
>>  }
>>  #endif
>>
>> @@ -482,14 +482,14 @@ extern inline u64 readq(const volatile void __iomem 
>> *addr)
>>
>>  extern inline void writel(u32 b, volatile void __iomem *addr)
>>  {
>> - __raw_writel(b, addr);
>>   mb();
>> + __raw_writel(b, addr);
>>  }
>>
>>  extern inline void writeq(u64 b, volatile void __iomem *addr)
>>  {
>> - __raw_writeq(b, addr);
>>   mb();
>> + __raw_writeq(b, addr);
>>  }
>>  #endif
>>
>>
>
>
> Can we get these merged to 4.17?
>
> There was a consensus to fix the architectures having API violation issues.
> https://www.mail-archive.com/netdev@vger.kernel.org/msg225971.html

I expect so. Sorry I haven't had time to collect patches yet. I think
tomorrow or the next day I will.


Re: [PATCH] alpha: io: reorder barriers to guarantee writeX() and iowriteX() ordering

2018-04-05 Thread Matt Turner
On Thu, Apr 5, 2018 at 6:35 PM, Sinan Kaya  wrote:
> On 4/2/2018 1:48 PM, Sinan Kaya wrote:
>> memory-barriers.txt has been updated with the following requirement.
>>
>> "When using writel(), a prior wmb() is not needed to guarantee that the
>> cache coherent memory writes have completed before writing to the MMIO
>> region."
>>
>> Current writeX() and iowriteX() implementations on alpha are not
>> satisfying this requirement as the barrier is after the register write.
>>
>> Move mb() in writeX() and iowriteX() functions to guarantee that HW
>> observes memory changes before performing register operations.
>>
>> Signed-off-by: Sinan Kaya 
>> Reported-by: Arnd Bergmann 
>> ---
>>  arch/alpha/include/asm/io.h | 14 +++---
>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
>> index d123ff9..4c533fc 100644
>> --- a/arch/alpha/include/asm/io.h
>> +++ b/arch/alpha/include/asm/io.h
>> @@ -341,14 +341,14 @@ extern inline unsigned int ioread16(void __iomem *addr)
>>
>>  extern inline void iowrite8(u8 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite8)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite8)(b, addr);
>>  }
>>
>>  extern inline void iowrite16(u16 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite16)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite16)(b, addr);
>>  }
>>
>>  extern inline u8 inb(unsigned long port)
>> @@ -382,8 +382,8 @@ extern inline unsigned int ioread32(void __iomem *addr)
>>
>>  extern inline void iowrite32(u32 b, void __iomem *addr)
>>  {
>> - IO_CONCAT(__IO_PREFIX,iowrite32)(b, addr);
>>   mb();
>> + IO_CONCAT(__IO_PREFIX, iowrite32)(b, addr);
>>  }
>>
>>  extern inline u32 inl(unsigned long port)
>> @@ -434,14 +434,14 @@ extern inline u16 readw(const volatile void __iomem 
>> *addr)
>>
>>  extern inline void writeb(u8 b, volatile void __iomem *addr)
>>  {
>> - __raw_writeb(b, addr);
>>   mb();
>> + __raw_writeb(b, addr);
>>  }
>>
>>  extern inline void writew(u16 b, volatile void __iomem *addr)
>>  {
>> - __raw_writew(b, addr);
>>   mb();
>> + __raw_writew(b, addr);
>>  }
>>  #endif
>>
>> @@ -482,14 +482,14 @@ extern inline u64 readq(const volatile void __iomem 
>> *addr)
>>
>>  extern inline void writel(u32 b, volatile void __iomem *addr)
>>  {
>> - __raw_writel(b, addr);
>>   mb();
>> + __raw_writel(b, addr);
>>  }
>>
>>  extern inline void writeq(u64 b, volatile void __iomem *addr)
>>  {
>> - __raw_writeq(b, addr);
>>   mb();
>> + __raw_writeq(b, addr);
>>  }
>>  #endif
>>
>>
>
>
> Can we get these merged to 4.17?
>
> There was a consensus to fix the architectures having API violation issues.
> https://www.mail-archive.com/netdev@vger.kernel.org/msg225971.html

I expect so. Sorry I haven't had time to collect patches yet. I think
tomorrow or the next day I will.


[PATCH] staging:rtl8712: required line 80 over and multiple line

2018-04-05 Thread GyeDo Park
clean up checkpatch warning:
Warning: line over 80 characters
Warning: Avoid multiple line dereference

Signed-off-by: GyeDo Park 
---
 drivers/staging/rtl8712/rtl871x_mlme.c | 92 +-
 1 file changed, 36 insertions(+), 56 deletions(-)

diff --git a/drivers/staging/rtl8712/rtl871x_mlme.c 
b/drivers/staging/rtl8712/rtl871x_mlme.c
index 7824508..db50677 100644
--- a/drivers/staging/rtl8712/rtl871x_mlme.c
+++ b/drivers/staging/rtl8712/rtl871x_mlme.c
@@ -469,9 +469,8 @@ static int is_desired_network(struct _adapter *adapter,
bselected = false;
if (check_fwstate(&adapter->mlmepriv, WIFI_ADHOC_STATE)) {
if (pnetwork->network.InfrastructureMode !=
-   adapter->mlmepriv.cur_network.network.
-   InfrastructureMode)
-   bselected = false;
+   adapter->mlmepriv.cur_network.network.InfrastructureMode)
+   bselected = false;
}
return bselected;
 }

@@ -721,27 +720,25 @@ void r8712_joinbss_event_callback(struct _adapter 
*adapter, u8 *pbuf)
pnetwork->network.Privacy = le32_to_cpu(pnetwork->network.Privacy);
pnetwork->network.Rssi = le32_to_cpu(pnetwork->network.Rssi);
pnetwork->network.NetworkTypeInUse =
-le32_to_cpu(pnetwork->network.NetworkTypeInUse);
+ le32_to_cpu(pnetwork->network.NetworkTypeInUse);
pnetwork->network.Configuration.ATIMWindow =
-le32_to_cpu(pnetwork->network.Configuration.ATIMWindow);
+ le32_to_cpu(pnetwork->network.Configuration.ATIMWindow);
pnetwork->network.Configuration.BeaconPeriod =
-le32_to_cpu(pnetwork->network.Configuration.BeaconPeriod);
+ le32_to_cpu(pnetwork->network.Configuration.BeaconPeriod);
pnetwork->network.Configuration.DSConfig =
-le32_to_cpu(pnetwork->network.Configuration.DSConfig);
+ le32_to_cpu(pnetwork->network.Configuration.DSConfig);
pnetwork->network.Configuration.FHConfig.DwellTime =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.
-DwellTime);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.DwellTime);
pnetwork->network.Configuration.FHConfig.HopPattern =
-le32_to_cpu(pnetwork->network.Configuration.
-FHConfig.HopPattern);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopPattern);
pnetwork->network.Configuration.FHConfig.HopSet =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopSet);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopSet);
pnetwork->network.Configuration.FHConfig.Length =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.Length);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.Length);
pnetwork->network.Configuration.Length =
-le32_to_cpu(pnetwork->network.Configuration.Length);
+ le32_to_cpu(pnetwork->network.Configuration.Length);
pnetwork->network.InfrastructureMode =
-le32_to_cpu(pnetwork->network.InfrastructureMode);
+ le32_to_cpu(pnetwork->network.InfrastructureMode);
pnetwork->network.IELength = le32_to_cpu(pnetwork->network.IELength);
 #endif

-- 
2.7.4



[PATCH] staging:rtl8712: required line 80 over and multiple line

2018-04-05 Thread GyeDo Park
clean up checkpatch warning:
Warning: line over 80 characters
Warning: Avoid multiple line dereference

Signed-off-by: GyeDo Park 
---
 drivers/staging/rtl8712/rtl871x_mlme.c | 92 +-
 1 file changed, 36 insertions(+), 56 deletions(-)

diff --git a/drivers/staging/rtl8712/rtl871x_mlme.c 
b/drivers/staging/rtl8712/rtl871x_mlme.c
index 7824508..db50677 100644
--- a/drivers/staging/rtl8712/rtl871x_mlme.c
+++ b/drivers/staging/rtl8712/rtl871x_mlme.c
@@ -469,9 +469,8 @@ static int is_desired_network(struct _adapter *adapter,
bselected = false;
if (check_fwstate(&adapter->mlmepriv, WIFI_ADHOC_STATE)) {
if (pnetwork->network.InfrastructureMode !=
-   adapter->mlmepriv.cur_network.network.
-   InfrastructureMode)
-   bselected = false;
+   adapter->mlmepriv.cur_network.network.InfrastructureMode)
+   bselected = false;
}
return bselected;
 }

@@ -721,27 +720,25 @@ void r8712_joinbss_event_callback(struct _adapter 
*adapter, u8 *pbuf)
pnetwork->network.Privacy = le32_to_cpu(pnetwork->network.Privacy);
pnetwork->network.Rssi = le32_to_cpu(pnetwork->network.Rssi);
pnetwork->network.NetworkTypeInUse =
-le32_to_cpu(pnetwork->network.NetworkTypeInUse);
+ le32_to_cpu(pnetwork->network.NetworkTypeInUse);
pnetwork->network.Configuration.ATIMWindow =
-le32_to_cpu(pnetwork->network.Configuration.ATIMWindow);
+ le32_to_cpu(pnetwork->network.Configuration.ATIMWindow);
pnetwork->network.Configuration.BeaconPeriod =
-le32_to_cpu(pnetwork->network.Configuration.BeaconPeriod);
+ le32_to_cpu(pnetwork->network.Configuration.BeaconPeriod);
pnetwork->network.Configuration.DSConfig =
-le32_to_cpu(pnetwork->network.Configuration.DSConfig);
+ le32_to_cpu(pnetwork->network.Configuration.DSConfig);
pnetwork->network.Configuration.FHConfig.DwellTime =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.
-DwellTime);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.DwellTime);
pnetwork->network.Configuration.FHConfig.HopPattern =
-le32_to_cpu(pnetwork->network.Configuration.
-FHConfig.HopPattern);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopPattern);
pnetwork->network.Configuration.FHConfig.HopSet =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopSet);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.HopSet);
pnetwork->network.Configuration.FHConfig.Length =
-le32_to_cpu(pnetwork->network.Configuration.FHConfig.Length);
+ le32_to_cpu(pnetwork->network.Configuration.FHConfig.Length);
pnetwork->network.Configuration.Length =
-le32_to_cpu(pnetwork->network.Configuration.Length);
+ le32_to_cpu(pnetwork->network.Configuration.Length);
pnetwork->network.InfrastructureMode =
-le32_to_cpu(pnetwork->network.InfrastructureMode);
+ le32_to_cpu(pnetwork->network.InfrastructureMode);
pnetwork->network.IELength = le32_to_cpu(pnetwork->network.IELength);
 #endif

-- 
2.7.4



Re: [PATCH v9 05/10] cpuidle: Return nohz hint from cpuidle_select()

2018-04-05 Thread Frederic Weisbecker
On Wed, Apr 04, 2018 at 10:39:50AM +0200, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki 
> Index: linux-pm/kernel/time/tick-sched.c
> ===
> --- linux-pm.orig/kernel/time/tick-sched.c
> +++ linux-pm/kernel/time/tick-sched.c
> @@ -991,6 +991,20 @@ void tick_nohz_irq_exit(void)
>  }
>  
>  /**
> + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
> + */
> +bool tick_nohz_idle_got_tick(void)
> +{
> + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
> +
> + if (ts->inidle > 1) {
> + ts->inidle = 1;
> + return true;
> + }
> + return false;
> +}
> +
> +/**
>   * tick_nohz_get_sleep_length - return the length of the current sleep
>   *
>   * Called from power state control code with interrupts disabled
> @@ -1101,6 +1115,9 @@ static void tick_nohz_handler(struct clo
>   struct pt_regs *regs = get_irq_regs();
>   ktime_t now = ktime_get();
>  
> + if (ts->inidle)
> + ts->inidle = 2;
> +

You can move that to tick_sched_do_timer() to avoid code duplication.

Also these constants are very opaque. And even with proper symbols it wouldn't 
look
right to extend ts->inidle that way.

Perhaps you should add a field such as ts->got_idle_tick under the boolean 
fields
after the below patch:

--
>From c7b2ca5a4c512517ddfeb9f922d5999f82542ced Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker 
Date: Fri, 6 Apr 2018 04:32:37 +0200
Subject: [PATCH] nohz: Gather tick_sched booleans under a common flag field

This optimizes the space and leaves plenty of room for further flags.

Signed-off-by: Frederic Weisbecker 
---
 kernel/time/tick-sched.h | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43d..38f24dc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -45,14 +45,17 @@ struct tick_sched {
struct hrtimer  sched_timer;
unsigned long   check_clocks;
enum tick_nohz_mode nohz_mode;
+
+   unsigned intinidle  : 1;
+   unsigned inttick_stopped: 1;
+   unsigned intidle_active : 1;
+   unsigned intdo_timer_last   : 1;
+
ktime_t last_tick;
ktime_t next_tick;
-   int inidle;
-   int tick_stopped;
unsigned long   idle_jiffies;
unsigned long   idle_calls;
unsigned long   idle_sleeps;
-   int idle_active;
ktime_t idle_entrytime;
ktime_t idle_waketime;
ktime_t idle_exittime;
@@ -62,7 +65,6 @@ struct tick_sched {
unsigned long   last_jiffies;
u64 next_timer;
ktime_t idle_expires;
-   int do_timer_last;
atomic_ttick_dep_mask;
 };
 
-- 
2.7.4





Re: [PATCH v9 05/10] cpuidle: Return nohz hint from cpuidle_select()

2018-04-05 Thread Frederic Weisbecker
On Wed, Apr 04, 2018 at 10:39:50AM +0200, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki 
> Index: linux-pm/kernel/time/tick-sched.c
> ===
> --- linux-pm.orig/kernel/time/tick-sched.c
> +++ linux-pm/kernel/time/tick-sched.c
> @@ -991,6 +991,20 @@ void tick_nohz_irq_exit(void)
>  }
>  
>  /**
> + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
> + */
> +bool tick_nohz_idle_got_tick(void)
> +{
> + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
> +
> + if (ts->inidle > 1) {
> + ts->inidle = 1;
> + return true;
> + }
> + return false;
> +}
> +
> +/**
>   * tick_nohz_get_sleep_length - return the length of the current sleep
>   *
>   * Called from power state control code with interrupts disabled
> @@ -1101,6 +1115,9 @@ static void tick_nohz_handler(struct clo
>   struct pt_regs *regs = get_irq_regs();
>   ktime_t now = ktime_get();
>  
> + if (ts->inidle)
> + ts->inidle = 2;
> +

You can move that to tick_sched_do_timer() to avoid code duplication.

Also these constants are very opaque. And even with proper symbols it wouldn't 
look
right to extend ts->inidle that way.

Perhaps you should add a field such as ts->got_idle_tick under the boolean 
fields
after the below patch:

--
>From c7b2ca5a4c512517ddfeb9f922d5999f82542ced Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker 
Date: Fri, 6 Apr 2018 04:32:37 +0200
Subject: [PATCH] nohz: Gather tick_sched booleans under a common flag field

This optimizes the space and leaves plenty of room for further flags.

Signed-off-by: Frederic Weisbecker 
---
 kernel/time/tick-sched.h | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43d..38f24dc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -45,14 +45,17 @@ struct tick_sched {
struct hrtimer  sched_timer;
unsigned long   check_clocks;
enum tick_nohz_mode nohz_mode;
+
+   unsigned intinidle  : 1;
+   unsigned inttick_stopped: 1;
+   unsigned intidle_active : 1;
+   unsigned intdo_timer_last   : 1;
+
ktime_t last_tick;
ktime_t next_tick;
-   int inidle;
-   int tick_stopped;
unsigned long   idle_jiffies;
unsigned long   idle_calls;
unsigned long   idle_sleeps;
-   int idle_active;
ktime_t idle_entrytime;
ktime_t idle_waketime;
ktime_t idle_exittime;
@@ -62,7 +65,6 @@ struct tick_sched {
unsigned long   last_jiffies;
u64 next_timer;
ktime_t idle_expires;
-   int do_timer_last;
atomic_ttick_dep_mask;
 };
 
-- 
2.7.4





Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings

2018-04-05 Thread David Miller
From: Andrew Lunn 
Date: Thu, 5 Apr 2018 22:40:49 +0200

> Or could it still contain whatever state the last boot of Linux, or
> maybe the bootloader, left the PHY in?

Right, this is my concern as well.


Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings

2018-04-05 Thread David Miller
From: Andrew Lunn 
Date: Thu, 5 Apr 2018 22:40:49 +0200

> Or could it still contain whatever state the last boot of Linux, or
> maybe the bootloader, left the PHY in?

Right, this is my concern as well.


Re: [PATCH 0/4] hv_netvsc: Fix shutdown issues on older Windows hosts

2018-04-05 Thread David Miller
From: Mohammed Gamal 
Date: Thu,  5 Apr 2018 21:09:17 +0200

> Guests running on WS2012 hosts would not shutdown when changing network
> interface setting (e.g. Number of channels, MTU ... etc). 
> 
> This patch series addresses these shutdown issues we encountered with WS2012
> hosts. It's essentially a rework of the series sent in 
> https://lkml.org/lkml/2018/1/23/111 on top of latest upstream
> 
> Fixes: 0ef58b0a05c1 ("hv_netvsc: change GPAD teardown order on older 
> versions")

Series applied, thank you.


Re: [PATCH 0/4] hv_netvsc: Fix shutdown issues on older Windows hosts

2018-04-05 Thread David Miller
From: Mohammed Gamal 
Date: Thu,  5 Apr 2018 21:09:17 +0200

> Guests running on WS2012 hosts would not shutdown when changing network
> interface setting (e.g. Number of channels, MTU ... etc). 
> 
> This patch series addresses these shutdown issues we encountered with WS2012
> hosts. It's essentially a rework of the series sent in 
> https://lkml.org/lkml/2018/1/23/111 on top of latest upstream
> 
> Fixes: 0ef58b0a05c1 ("hv_netvsc: change GPAD teardown order on older 
> versions")

Series applied, thank you.


Re: 4.15.14 crash with iscsi target and dvd

2018-04-05 Thread Bart Van Assche
On Thu, 2018-04-05 at 22:06 -0400, Wakko Warner wrote:
> I know now why scsi_print_command isn't doing anything.  cmd->cmnd is null.
> I added a dev_printk in scsi_print_command where the 2 if statements return.
> Logs:
> [  29.866415] sr 3:0:0:0: cmd->cmnd is NULL

That's something that should never happen. As one can see in
scsi_setup_scsi_cmnd() and scsi_setup_fs_cmnd() both functions initialize
that pointer. Since I have not yet been able to reproduce myself what you
reported, would it be possible for you to bisect this issue? You will need
to follow something like the following procedure (see also
https://git-scm.com/docs/git-bisect):

git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
git bisect start
git bisect bad v4.10
git bisect good v4.9

and then build the kernel, install it, boot the kernel and test it.
Depending on the result, run either git bisect bad or git bisect good and
keep going until git bisect comes to a conclusion. This can take an hour or
more.

Bart.





Re: 4.15.14 crash with iscsi target and dvd

2018-04-05 Thread Bart Van Assche
On Thu, 2018-04-05 at 22:06 -0400, Wakko Warner wrote:
> I know now why scsi_print_command isn't doing anything.  cmd->cmnd is null.
> I added a dev_printk in scsi_print_command where the 2 if statements return.
> Logs:
> [  29.866415] sr 3:0:0:0: cmd->cmnd is NULL

That's something that should never happen. As one can see in
scsi_setup_scsi_cmnd() and scsi_setup_fs_cmnd() both functions initialize
that pointer. Since I have not yet been able to reproduce myself what you
reported, would it be possible for you to bisect this issue? You will need
to follow something like the following procedure (see also
https://git-scm.com/docs/git-bisect):

git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
git bisect start
git bisect bad v4.10
git bisect good v4.9

and then build the kernel, install it, boot the kernel and test it.
Depending on the result, run either git bisect bad or git bisect good and
keep going until git bisect comes to a conclusion. This can take an hour or
more.

Bart.





Re: [PATCH net-next] hv_netvsc: Add NetVSP v6 into version negotiation

2018-04-05 Thread David Miller
From: Haiyang Zhang 
Date: Thu,  5 Apr 2018 11:42:22 -0700

> From: Haiyang Zhang 
> 
> This patch adds the NetVSP v6 message structures, and includes this
> version into NetVSC/NetVSP version negotiation process.
> 
> Signed-off-by: Haiyang Zhang 

The net-next tree is closed, please resubmit this when the net-next
tree reopens.

Thank you.


Re: [PATCH net-next] hv_netvsc: Add NetVSP v6 into version negotiation

2018-04-05 Thread David Miller
From: Haiyang Zhang 
Date: Thu,  5 Apr 2018 11:42:22 -0700

> From: Haiyang Zhang 
> 
> This patch adds the NetVSP v6 message structures, and includes this
> version into NetVSC/NetVSP version negotiation process.
> 
> Signed-off-by: Haiyang Zhang 

The net-next tree is closed, please resubmit this when the net-next
tree reopens.

Thank you.


Re: [PATCH -mm] mm, gup: prevent pmd checking race in follow_pmd_mask()

2018-04-05 Thread Zi Yan
On 5 Apr 2018, at 21:57, huang ying wrote:

> On Wed, Apr 4, 2018 at 11:02 PM, Zi Yan  wrote:
>> On 3 Apr 2018, at 23:22, Huang, Ying wrote:
>>
>>> From: Huang Ying 
>>>
>>> mmap_sem will be read locked when calling follow_pmd_mask().  But this
>>> cannot prevent PMD from being changed for all cases when PTL is
>>> unlocked, for example, from pmd_trans_huge() to pmd_none() via
>>> MADV_DONTNEED.  So it is possible for the pmd_present() check in
>>> follow_pmd_mask() encounter a none PMD.  This may cause incorrect
>>> VM_BUG_ON() or infinite loop.  Fixed this via reading PMD entry again
>>> but only once and checking the local variable and pmd_none() in the
>>> retry loop.
>>>
>>> As Kirill pointed out, with PTL unlocked, the *pmd may be changed
>>> under us, so read it directly again and again may incur weird bugs.
>>> So although using *pmd directly other than pmd_present() checking may
>>> be safe, it is still better to replace them to read *pmd once and
>>> check the local variable for multiple times.
>>
>> I see your point there. The patch wants to provide a consistent value
>> for all race checks. Specifically, this patch is trying to avoid the 
>> inconsistent
>> reads of *pmd for if-statements, which causes problem when both if-condition 
>> reads *pmd and
>> the statements inside "if" reads *pmd again and two reads can give different 
>> values.
>> Am I right about this?
>
> Yes.
>
>> If yes, the problem can be solved by something like:
>>
>> if (!pmd_present(tmpval = *pmd)) {
>> check tmpval instead of *pmd;
>> }
>>
>> Right?
>
> I think this isn't enough yet.  we need
>
> tmpval = READ_ONCE(*pmd);
>
> To prevent compiler to generate code to read *pmd again and again.
> Please check the comments of pmd_none_or_trans_huge_or_clear_bad()
> about barrier.

Got it. And if there is a barrier (implicit or explicit) inside if-statement, 
like
pmd_migrationt_entry_wait(mm, pmd), we need to update tmpval with READ_ONCE() 
after the barrier.

The patch looks good to me. Thanks.

Reviewed-by: Zi Yan 

—
Best Regards,
Yan Zi


signature.asc
Description: OpenPGP digital signature


Re: [PATCH -mm] mm, gup: prevent pmd checking race in follow_pmd_mask()

2018-04-05 Thread Zi Yan
On 5 Apr 2018, at 21:57, huang ying wrote:

> On Wed, Apr 4, 2018 at 11:02 PM, Zi Yan  wrote:
>> On 3 Apr 2018, at 23:22, Huang, Ying wrote:
>>
>>> From: Huang Ying 
>>>
>>> mmap_sem will be read locked when calling follow_pmd_mask().  But this
>>> cannot prevent PMD from being changed for all cases when PTL is
>>> unlocked, for example, from pmd_trans_huge() to pmd_none() via
>>> MADV_DONTNEED.  So it is possible for the pmd_present() check in
>>> follow_pmd_mask() encounter a none PMD.  This may cause incorrect
>>> VM_BUG_ON() or infinite loop.  Fixed this via reading PMD entry again
>>> but only once and checking the local variable and pmd_none() in the
>>> retry loop.
>>>
>>> As Kirill pointed out, with PTL unlocked, the *pmd may be changed
>>> under us, so read it directly again and again may incur weird bugs.
>>> So although using *pmd directly other than pmd_present() checking may
>>> be safe, it is still better to replace them to read *pmd once and
>>> check the local variable for multiple times.
>>
>> I see your point there. The patch wants to provide a consistent value
>> for all race checks. Specifically, this patch is trying to avoid the 
>> inconsistent
>> reads of *pmd for if-statements, which causes problem when both if-condition 
>> reads *pmd and
>> the statements inside "if" reads *pmd again and two reads can give different 
>> values.
>> Am I right about this?
>
> Yes.
>
>> If yes, the problem can be solved by something like:
>>
>> if (!pmd_present(tmpval = *pmd)) {
>> check tmpval instead of *pmd;
>> }
>>
>> Right?
>
> I think this isn't enough yet.  we need
>
> tmpval = READ_ONCE(*pmd);
>
> To prevent compiler to generate code to read *pmd again and again.
> Please check the comments of pmd_none_or_trans_huge_or_clear_bad()
> about barrier.

Got it. And if there is a barrier (implicit or explicit) inside if-statement, 
like
pmd_migrationt_entry_wait(mm, pmd), we need to update tmpval with READ_ONCE() 
after the barrier.

The patch looks good to me. Thanks.

Reviewed-by: Zi Yan 

—
Best Regards,
Yan Zi


signature.asc
Description: OpenPGP digital signature


Re: [PATCH 08/32] aio: replace kiocb_set_cancel_fn with a cancel_kiocb file operation

2018-04-05 Thread Al Viro
On Fri, Mar 30, 2018 at 05:07:45PM +0200, Christoph Hellwig wrote:
> The current kiocb_set_cancel_fn implementation assumes the kiocb is
> embedded into an aio_kiocb, which is fundamentally unsafe as it might
> have been submitted by non-aio callers.  Instead add a cancel_kiocb
> file operation that replaced the ki_cancel function pointer set by
> kiocb_set_cancel_fn, and only adds iocbs to the active list when
> the read/write_iter methods return -EIOCBQUEUED and the file has
> a cancel_kiocb method.

> @@ -1440,6 +1423,16 @@ static inline ssize_t aio_rw_ret(struct kiocb *req, 
> ssize_t ret)
>  {
>   switch (ret) {
>   case -EIOCBQUEUED:
> + if (req->ki_filp->f_op->cancel_kiocb) {

... and by that point req might've been already freed by IO completion on
another CPU.
> + struct aio_kiocb *iocb =
> + container_of(req, struct aio_kiocb, rw);
> + struct kioctx *ctx = iocb->ki_ctx;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ctx->ctx_lock, flags);
> + list_add_tail(&iocb->ki_list, &ctx->active_reqs);



Re: [PATCH 08/32] aio: replace kiocb_set_cancel_fn with a cancel_kiocb file operation

2018-04-05 Thread Al Viro
On Fri, Mar 30, 2018 at 05:07:45PM +0200, Christoph Hellwig wrote:
> The current kiocb_set_cancel_fn implementation assumes the kiocb is
> embedded into an aio_kiocb, which is fundamentally unsafe as it might
> have been submitted by non-aio callers.  Instead add a cancel_kiocb
> file operation that replaced the ki_cancel function pointer set by
> kiocb_set_cancel_fn, and only adds iocbs to the active list when
> the read/write_iter methods return -EIOCBQUEUED and the file has
> a cancel_kiocb method.

> @@ -1440,6 +1423,16 @@ static inline ssize_t aio_rw_ret(struct kiocb *req, 
> ssize_t ret)
>  {
>   switch (ret) {
>   case -EIOCBQUEUED:
> + if (req->ki_filp->f_op->cancel_kiocb) {

... and by that point req might've been already freed by IO completion on
another CPU.
> + struct aio_kiocb *iocb =
> + container_of(req, struct aio_kiocb, rw);
> + struct kioctx *ctx = iocb->ki_ctx;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ctx->ctx_lock, flags);
> + list_add_tail(&iocb->ki_list, &ctx->active_reqs);



Re: [PATCH v2 4/4] mm/vmscan: Don't mess with pgdat->flags in memcg reclaim.

2018-04-05 Thread Shakeel Butt
On Fri, Mar 23, 2018 at 8:20 AM, Andrey Ryabinin
 wrote:
> memcg reclaim may alter pgdat->flags based on the state of LRU lists
> in cgroup and its children. PGDAT_WRITEBACK may force kswapd to sleep
> congested_wait(), PGDAT_DIRTY may force kswapd to writeback filesystem
> pages. But the worst here is PGDAT_CONGESTED, since it may force all
> direct reclaims to stall in wait_iff_congested(). Note that only kswapd
> has the power to clear any of these bits. This might just never happen if
> cgroup limits configured that way. So all direct reclaims will stall
> as long as we have some congested bdi in the system.
>
> Leave all pgdat->flags manipulations to kswapd. kswapd scans the whole
> pgdat, only kswapd can clear pgdat->flags once the node is balanced, thus
> it's reasonable to leave all decisions about node state to kswapd.

What about global reclaimers? Is the assumption that when global
reclaimers hit such condition, kswapd will be running and correctly
set PGDAT_CONGESTED?

>
> Moving pgdat->flags manipulation to kswapd means that cgroup2 reclaim
> now loses its congestion throttling mechanism. Add per-cgroup congestion
> state and throttle cgroup2 reclaimers if memcg is in congestion state.
>
> Currently there is no need in per-cgroup PGDAT_WRITEBACK and PGDAT_DIRTY
> bits since they alter only kswapd behavior.
>
> The problem could be easily demonstrated by creating heavy congestion
> in one cgroup:
>
> echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control
> mkdir -p /sys/fs/cgroup/congester
> echo 512M > /sys/fs/cgroup/congester/memory.max
> echo $$ > /sys/fs/cgroup/congester/cgroup.procs
> /* generate a lot of dirty data on slow HDD */
> while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
> 
> while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
>
> and some job in another cgroup:
>
> mkdir /sys/fs/cgroup/victim
> echo 128M > /sys/fs/cgroup/victim/memory.max
>
> # time cat /dev/sda > /dev/null
> real10m15.054s
> user0m0.487s
> sys 1m8.505s
>
> According to the tracepoint in wait_iff_congested(), the 'cat' spent 50%
> of the time sleeping there.
>
> With the patch, cat doesn't waste time anymore:
>
> # time cat /dev/sda > /dev/null
> real5m32.911s
> user0m0.411s
> sys 0m56.664s
>
> Signed-off-by: Andrey Ryabinin 
> ---
>  include/linux/backing-dev.h |  2 +-
>  include/linux/memcontrol.h  |  2 ++
>  mm/backing-dev.c| 19 --
>  mm/vmscan.c | 86 
> -
>  4 files changed, 71 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index f8894dbc0b19..539a5cf94fe2 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, 
> int cong_bits)
>  }
>
>  long congestion_wait(int sync, long timeout);
> -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
> +long wait_iff_congested(int sync, long timeout);
>
>  static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
>  {
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4525b4404a9e..44422e1d3def 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -190,6 +190,8 @@ struct mem_cgroup {
> /* vmpressure notifications */
> struct vmpressure vmpressure;
>
> +   unsigned long flags;
> +

nit(you can ignore it): The name 'flags' is too general IMO. Something
more specific would be helpful.

Question: Does this 'flags' have any hierarchical meaning? Does a
congested parent mean all descendants are congested?
Question: Should this 'flags' be per-node? Is it ok for a congested
memcg to call wait_iff_congested for all nodes?

> /*
>  * Should the accounting and control be hierarchical, per subtree?
>  */
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 2eba1f54b1d3..2fc3f38e4c4f 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -1055,23 +1055,18 @@ EXPORT_SYMBOL(congestion_wait);
>
>  /**
>   * wait_iff_congested - Conditionally wait for a backing_dev to become 
> uncongested or a pgdat to complete writes
> - * @pgdat: A pgdat to check if it is heavily congested
>   * @sync: SYNC or ASYNC IO
>   * @timeout: timeout in jiffies
>   *
> - * In the event of a congested backing_dev (any backing_dev) and the given
> - * @pgdat has experienced recent congestion, this waits for up to @timeout
> - * jiffies for either a BDI to exit congestion of the given @sync queue
> - * or a write to complete.
> - *
> - * In the absence of pgdat congestion, cond_resched() is called to yield
> - * the processor if necessary but otherwise does not sleep.
> + * In the event of a congested backing_dev (any backing_dev) this 

Re: [PATCH v2 4/4] mm/vmscan: Don't mess with pgdat->flags in memcg reclaim.

2018-04-05 Thread Shakeel Butt
On Fri, Mar 23, 2018 at 8:20 AM, Andrey Ryabinin
 wrote:
> memcg reclaim may alter pgdat->flags based on the state of LRU lists
> in cgroup and its children. PGDAT_WRITEBACK may force kswapd to sleep
> congested_wait(), PGDAT_DIRTY may force kswapd to writeback filesystem
> pages. But the worst here is PGDAT_CONGESTED, since it may force all
> direct reclaims to stall in wait_iff_congested(). Note that only kswapd
> has the power to clear any of these bits. This might just never happen if
> cgroup limits configured that way. So all direct reclaims will stall
> as long as we have some congested bdi in the system.
>
> Leave all pgdat->flags manipulations to kswapd. kswapd scans the whole
> pgdat, only kswapd can clear pgdat->flags once the node is balanced, thus
> it's reasonable to leave all decisions about node state to kswapd.

What about global reclaimers? Is the assumption that when global
reclaimers hit such condition, kswapd will be running and correctly
set PGDAT_CONGESTED?

>
> Moving pgdat->flags manipulation to kswapd means that cgroup2 reclaim
> now loses its congestion throttling mechanism. Add per-cgroup congestion
> state and throttle cgroup2 reclaimers if memcg is in congestion state.
>
> Currently there is no need in per-cgroup PGDAT_WRITEBACK and PGDAT_DIRTY
> bits since they alter only kswapd behavior.
>
> The problem could be easily demonstrated by creating heavy congestion
> in one cgroup:
>
> echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control
> mkdir -p /sys/fs/cgroup/congester
> echo 512M > /sys/fs/cgroup/congester/memory.max
> echo $$ > /sys/fs/cgroup/congester/cgroup.procs
> /* generate a lot of dirty data on slow HDD */
> while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
> 
> while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
>
> and some job in another cgroup:
>
> mkdir /sys/fs/cgroup/victim
> echo 128M > /sys/fs/cgroup/victim/memory.max
>
> # time cat /dev/sda > /dev/null
> real10m15.054s
> user0m0.487s
> sys 1m8.505s
>
> According to the tracepoint in wait_iff_congested(), the 'cat' spent 50%
> of the time sleeping there.
>
> With the patch, cat doesn't waste time anymore:
>
> # time cat /dev/sda > /dev/null
> real5m32.911s
> user0m0.411s
> sys 0m56.664s
>
> Signed-off-by: Andrey Ryabinin 
> ---
>  include/linux/backing-dev.h |  2 +-
>  include/linux/memcontrol.h  |  2 ++
>  mm/backing-dev.c| 19 --
>  mm/vmscan.c | 86 
> -
>  4 files changed, 71 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index f8894dbc0b19..539a5cf94fe2 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, 
> int cong_bits)
>  }
>
>  long congestion_wait(int sync, long timeout);
> -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
> +long wait_iff_congested(int sync, long timeout);
>
>  static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
>  {
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4525b4404a9e..44422e1d3def 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -190,6 +190,8 @@ struct mem_cgroup {
> /* vmpressure notifications */
> struct vmpressure vmpressure;
>
> +   unsigned long flags;
> +

nit(you can ignore it): The name 'flags' is too general IMO. Something
more specific would be helpful.

Question: Does this 'flags' have any hierarchical meaning? Does a
congested parent mean all descendants are congested?
Question: Should this 'flags' be per-node? Is it ok for a congested
memcg to call wait_iff_congested for all nodes?

> /*
>  * Should the accounting and control be hierarchical, per subtree?
>  */
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 2eba1f54b1d3..2fc3f38e4c4f 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -1055,23 +1055,18 @@ EXPORT_SYMBOL(congestion_wait);
>
>  /**
>   * wait_iff_congested - Conditionally wait for a backing_dev to become 
> uncongested or a pgdat to complete writes
> - * @pgdat: A pgdat to check if it is heavily congested
>   * @sync: SYNC or ASYNC IO
>   * @timeout: timeout in jiffies
>   *
> - * In the event of a congested backing_dev (any backing_dev) and the given
> - * @pgdat has experienced recent congestion, this waits for up to @timeout
> - * jiffies for either a BDI to exit congestion of the given @sync queue
> - * or a write to complete.
> - *
> - * In the absence of pgdat congestion, cond_resched() is called to yield
> - * the processor if necessary but otherwise does not sleep.
> + * In the event of a congested backing_dev (any backing_dev) this waits
> + * for up to @timeout jiffies for either a 

Re: [PATCH net] net: mvpp2: Fix parser entry init boundary check

2018-04-05 Thread David Miller
From: Maxime Chevallier 
Date: Thu,  5 Apr 2018 11:55:48 +0200

> Boundary check in mvpp2_prs_init_from_hw must be done according to the
> passed "tid" parameter, not the mvpp2_prs_entry index, which is not yet
> initialized at the time of the check.
> 
> Fixes: 47e0e14eb1a6 ("net: mvpp2: Make mvpp2_prs_hw_read a parser entry init 
> function")
> Signed-off-by: Maxime Chevallier 

Applied, thanks.


Re: [PATCH net] net: mvpp2: Fix parser entry init boundary check

2018-04-05 Thread David Miller
From: Maxime Chevallier 
Date: Thu,  5 Apr 2018 11:55:48 +0200

> Boundary check in mvpp2_prs_init_from_hw must be done according to the
> passed "tid" parameter, not the mvpp2_prs_entry index, which is not yet
> initialized at the time of the check.
> 
> Fixes: 47e0e14eb1a6 ("net: mvpp2: Make mvpp2_prs_hw_read a parser entry init 
> function")
> Signed-off-by: Maxime Chevallier 

Applied, thanks.


Re: 4.15.14 crash with iscsi target and dvd

2018-04-05 Thread Wakko Warner
Wakko Warner wrote:
> Bart Van Assche wrote:
> > On Sun, 2018-04-01 at 14:27 -0400, Wakko Warner wrote:
> > > Wakko Warner wrote:
> > > > Wakko Warner wrote:
> > > > > I tested 4.14.32 last night with the same oops.  4.9.91 works fine.
> > > > > From the initiator, if I do cat /dev/sr1 > /dev/null it works.  If I 
> > > > > mount
> > > > > /dev/sr1 and then do find -type f | xargs cat > /dev/null the target
> > > > > crashes.  I'm using the builtin iscsi target with pscsi.  I can burn 
> > > > > from
> > > > > the initiator with out problems.  I'll test other kernels between 4.9 
> > > > > and
> > > > > 4.14.
> > > > 
> > > > So I've tested 4.x.y where x one of 10 11 12 14 15 and y is the latest 
> > > > patch
> > > > (except for 4.15 which was 1 behind)
> > > > Each of these kernels crash within seconds or immediate of doing find 
> > > > -type
> > > > f | xargs cat > /dev/null from the initiator.
> > > 
> > > I tried 4.10.0.  It doesn't completely lockup the system, but the device
> > > that was used hangs.  So from the initiator, it's /dev/sr1 and from the
> > > target it's /dev/sr0.  Attempting to read /dev/sr0 after the oops causes 
> > > the
> > > process to hang in D state.
> > 
> > Hello Wakko,
> > 
> > Thank you for having narrowed down this further. I think that you 
> > encountered
> > a regression either in the block layer core or in the SCSI core. 
> > Unfortunately
> > the number of changes between kernel versions v4.9 and v4.10 in these two
> > subsystems is huge. I see two possible ways forward:
> > - Either that you perform a bisect to identify the patch that introduced 
> > this
> >   regression. However, I'm not sure whether you are familiar with the bisect
> >   process.
> > - Or that you identify the command that triggers this crash such that others
> >   can reproduce this issue without needing access to your setup.
> > 
> > How about reproducing this crash with the below patch applied on top of
> > kernel v4.15.x? The additional output sent by this patch to the system log
> > should allow us to reproduce this issue by submitting the same SCSI command
> > with sg_raw.
> 
> Ok, so I tried this, but scsi_print_command doesn't print anything.  I added
> a check for !rq and the same thing that blk_rq_nr_phys_segments does in an
> if statement above this thinking it might have crashed during WARN_ON_ONCE.
> It still didn't print anything.  My printk shows this:
> [  36.263193] sr 3:0:0:0: cmd->request->nr_phys_segments is 0
> 
> I also had scsi_print_command in the same if block which again didn't print
> anything.  Is there some debug option I need to turn on to make it print?  I
> tried looking through the code for this and following some of the function
> calls but didn't see any config options.

I know now why scsi_print_command isn't doing anything.  cmd->cmnd is null.
I added a dev_printk in scsi_print_command where the 2 if statements return.
Logs:
[  29.866415] sr 3:0:0:0: cmd->cmnd is NULL

> > Subject: [PATCH] Report commands with no physical segments in the system log
> > 
> > ---
> >  drivers/scsi/scsi_lib.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 6b6a6705f6e5..74a39db57d49 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1093,8 +1093,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
> > bool is_mq = (rq->mq_ctx != NULL);
> > int error = BLKPREP_KILL;
> >  
> > -   if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
> > +   if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) {
> > +   scsi_print_command(cmd);
> > goto err_exit;
> > +   }
> >  
> > error = scsi_init_sgtable(rq, &cmd->sdb);
> > if (error)
> -- 
>  Microsoft has beaten Volkswagen's world record.  Volkswagen only created 22
>  million bugs.
-- 
 Microsoft has beaten Volkswagen's world record.  Volkswagen only created 22
 million bugs.


Re: 4.15.14 crash with iscsi target and dvd

2018-04-05 Thread Wakko Warner
Wakko Warner wrote:
> Bart Van Assche wrote:
> > On Sun, 2018-04-01 at 14:27 -0400, Wakko Warner wrote:
> > > Wakko Warner wrote:
> > > > Wakko Warner wrote:
> > > > > I tested 4.14.32 last night with the same oops.  4.9.91 works fine.
> > > > > From the initiator, if I do cat /dev/sr1 > /dev/null it works.  If I 
> > > > > mount
> > > > > /dev/sr1 and then do find -type f | xargs cat > /dev/null the target
> > > > > crashes.  I'm using the builtin iscsi target with pscsi.  I can burn 
> > > > > from
> > > > > the initiator with out problems.  I'll test other kernels between 4.9 
> > > > > and
> > > > > 4.14.
> > > > 
> > > > So I've tested 4.x.y where x one of 10 11 12 14 15 and y is the latest 
> > > > patch
> > > > (except for 4.15 which was 1 behind)
> > > > Each of these kernels crash within seconds or immediate of doing find 
> > > > -type
> > > > f | xargs cat > /dev/null from the initiator.
> > > 
> > > I tried 4.10.0.  It doesn't completely lockup the system, but the device
> > > that was used hangs.  So from the initiator, it's /dev/sr1 and from the
> > > target it's /dev/sr0.  Attempting to read /dev/sr0 after the oops causes 
> > > the
> > > process to hang in D state.
> > 
> > Hello Wakko,
> > 
> > Thank you for having narrowed down this further. I think that you 
> > encountered
> > a regression either in the block layer core or in the SCSI core. 
> > Unfortunately
> > the number of changes between kernel versions v4.9 and v4.10 in these two
> > subsystems is huge. I see two possible ways forward:
> > - Either that you perform a bisect to identify the patch that introduced 
> > this
> >   regression. However, I'm not sure whether you are familiar with the bisect
> >   process.
> > - Or that you identify the command that triggers this crash such that others
> >   can reproduce this issue without needing access to your setup.
> > 
> > How about reproducing this crash with the below patch applied on top of
> > kernel v4.15.x? The additional output sent by this patch to the system log
> > should allow us to reproduce this issue by submitting the same SCSI command
> > with sg_raw.
> 
> Ok, so I tried this, but scsi_print_command doesn't print anything.  I added
> a check for !rq and the same thing that blk_rq_nr_phys_segments does in an
> if statement above this thinking it might have crashed during WARN_ON_ONCE.
> It still didn't print anything.  My printk shows this:
> [  36.263193] sr 3:0:0:0: cmd->request->nr_phys_segments is 0
> 
> I also had scsi_print_command in the same if block which again didn't print
> anything.  Is there some debug option I need to turn on to make it print?  I
> tried looking through the code for this and following some of the function
> calls but didn't see any config options.

I know now why scsi_print_command isn't doing anything.  cmd->cmnd is null.
I added a dev_printk in scsi_print_command where the 2 if statements return.
Logs:
[  29.866415] sr 3:0:0:0: cmd->cmnd is NULL

> > Subject: [PATCH] Report commands with no physical segments in the system log
> > 
> > ---
> >  drivers/scsi/scsi_lib.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 6b6a6705f6e5..74a39db57d49 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1093,8 +1093,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
> > bool is_mq = (rq->mq_ctx != NULL);
> > int error = BLKPREP_KILL;
> >  
> > -   if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
> > +   if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) {
> > +   scsi_print_command(cmd);
> > goto err_exit;
> > +   }
> >  
> > error = scsi_init_sgtable(rq, &cmd->sdb);
> > if (error)
> -- 
>  Microsoft has beaten Volkswagen's world record.  Volkswagen only created 22
>  million bugs.
-- 
 Microsoft has beaten Volkswagen's world record.  Volkswagen only created 22
 million bugs.


Re: [GIT PULL] x86/build changes for v4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 3:51 PM, James Y Knight  wrote:
>
> Unfortunately, that behavior is required by the standard, it's not up to
> compiler optimization to change.

I actually mis-read your example - in your case it obviously does pass
the array itself down to the call, and yes, it obviously needs to be
allocated.

I had a test-case at one point where gcc avoided the stack allocation
entirely for a regular array, but not for a VLA (of the same constant
size) because the VLA logic is apparently different enough - even when
the size of the array is a compile-time constant.

We had that issue because we had a lot of trouble coming up with a
"max()" macro that was still an I-C-E (and we had a number of array
sizes that used "max()"). So all the array sizes were compile-time
constants, they just weren't traditional C arrays.

But now I can't recreate the thing. Maybe I had screwed up in my
test-case somehow.

   Linus


Re: [GIT PULL] x86/build changes for v4.17

2018-04-05 Thread Linus Torvalds
On Thu, Apr 5, 2018 at 3:51 PM, James Y Knight  wrote:
>
> Unfortunately, that behavior is required by the standard, it's not up to
> compiler optimization to change.

I actually mis-read your example - in your case it obviously does pass
the array itself down to the call, and yes, it obviously needs to be
allocated.

I had a test-case at one point where gcc avoided the stack allocation
entirely for a regular array, but not for a VLA (of the same constant
size) because the VLA logic is apparently different enough - even when
the size of the array is a compile-time constant.

We had that issue because we had a lot of trouble coming up with a
"max()" macro that was still an I-C-E (and we had a number of array
sizes that used "max()"). So all the array sizes were compile-time
constants, they just weren't traditional C arrays.

But now I can't recreate the thing. Maybe I had screwed up in my
test-case somehow.

   Linus


Re: [PATCH net] netns: filter uevents correctly

2018-04-05 Thread David Miller
From: Christian Brauner 
Date: Thu, 05 Apr 2018 01:27:16 +

> David, is it ok to queue this or would you prefer I resend when net-next
> reopens?

This definitely needs more discussion, and after the discussion some
further clarification in the commit log message based upon that
discussion if we move forward with this change at all.

Thank you.


Re: WARNING in up_write

2018-04-05 Thread Dave Chinner
On Thu, Apr 05, 2018 at 05:13:25PM -0700, Eric Biggers wrote:
> On Fri, Apr 06, 2018 at 08:32:26AM +1000, Dave Chinner wrote:
> > On Wed, Apr 04, 2018 at 08:24:54PM -0700, Matthew Wilcox wrote:
> > > On Wed, Apr 04, 2018 at 11:22:00PM -0400, Theodore Y. Ts'o wrote:
> > > > On Wed, Apr 04, 2018 at 12:35:04PM -0700, Matthew Wilcox wrote:
> > > > > On Wed, Apr 04, 2018 at 09:24:05PM +0200, Dmitry Vyukov wrote:
> > > > > > On Tue, Apr 3, 2018 at 4:01 AM, syzbot
> > > > > >  wrote:
> > > > > > > DEBUG_LOCKS_WARN_ON(sem->owner != get_current())
> > > > > > > WARNING: CPU: 1 PID: 4441 at kernel/locking/rwsem.c:133 
> > > > > > > up_write+0x1cc/0x210
> > > > > > > kernel/locking/rwsem.c:133
> > > > > > > Kernel panic - not syncing: panic_on_warn set ...
> > > > > 
> > > > > Message-Id: <1522852646-2196-1-git-send-email-long...@redhat.com>
> > > > >
> > > > 
> > > > We were way ahead of syzbot in this case.  :-)
> > > 
> > > Not really ... syzbot caught it Monday evening ;-)
> > 
> > Rather than arguing over who reported it first, I think that time
> > would be better spent reflecting on why the syzbot report was
> > completely ignored until *after* Ted diagnosed the issue
> > independently and Waiman had already fixed it
> > 
> > Clearly there is scope for improvement here.
> > 
> > Cheers,
> > 
> 
> Well, ultimately a human needed to investigate the syzbot bug report to figure
> out what was really going on.  In my view, the largest problem is that there 
> are
> simply too many bugs, so many are getting ignored.

Well, yeah. And when there's too many bugs, looking at the ones
people are actually hitting tend to take precedence over those
reported by a bot an image problem...

> If there were only a few bugs, then Dmitry would investigate each
> one and send a "real" bug report of better quality than the
> automated system can provide, or even send a fix directly.  But in
> reality, on the same day this bug was reported, syzbot also found
> 10 other bugs, and in the previous 2 days it had found 38 more.
> No single person can keep up with that. 

And this is precisely why people turn around and ask the syzbot
developers to do things that make it easier for them to diagnose
the problems syzbot reports.

> You can see the current
> bug list, which has 172 open bugs, on the dashboard at
> https://syzkaller.appspot.com/. 

Is that all? That's *nothing*.

> Yes, the kernel really is that
> broken.

Actually, that tells me the kernel is a hell of a lot better than my
experience leads me to believe it is. I'd have expected thousands of
bugs, even tens of thousands of bugs given how many issues we deal
with in individual subsystems on a day to day basis.

> And although quite a few of these bugs will end up to be
> duplicates or even already fixed, a human still has to look at
> each one to figure that out.  (Though, I do think that syzbot
> should try to automatically detect when a reproducible bug was
> already fixed, via bisection.  It would cause a few bugs to be
> incorrectly considered fixed, but it may be a worthwhile
> tradeoff.)
> 
> These bugs are all over the kernel as well, so most developers
> don't see the big picture but rather just see a few bugs for
> "their" subsystem on "their" subsystem's mailing list and
> sometimes demand special attention.  Of course, it's great when
> people suggest ways to improve the process.

That's not the response I got

> But it's not great
> when people just don't feel responsible for fixing bugs and wait
> for Someone Else to do it.

The excessive cross posting of the reports is one of the reasons
people think someone else will take care of it. i.e. "Oh, that looks VFS,
that went to -fsdevel, I don't need to look at it"

Put simply: if you're mounting an XFS filesystem image and something
goes bang, then it should be reported to the XFS list. It does not
need to be cross posted to LKML, -fsdevel, 10 individual developers,
etc. If it's not an XFS problem, then the XFS developers will CC the
relevant lists as needed.

> I'm hoping that in the future the syzbot "team", which seems to
> actually be just Dmitry now, can get more resources towards
> helping fix the bugs.  But either way, in the end Linux is a
> community effort.

We don't really need help fixing the bugs - we need help making it
easier to *find the bug* the bot tripped over. That's what the
syzbot team needs to focus on, not tell people that what they got is
all they are going to get.

> Note also that syzbot wasn't super useful in this particular case
> because people running xfstests came across the same bug.  But,
> this is actually a rare case.  Most syzbot bug reports have been
> for weird corner cases or races that no one ever thought of
> before, so there are no existing tests that find them.

Which is exactly what these whacky "mount a filesystem fragment"
tests it is now doing are exercising.  Finding the cause of
corruption related 

Re: [PATCH net] netns: filter uevents correctly

2018-04-05 Thread David Miller
From: Christian Brauner 
Date: Thu, 05 Apr 2018 01:27:16 +

> David, is it ok to queue this or would you prefer I resend when net-next
> reopens?

This definitely needs more discussion, and after the discussion some
further clarification in the commit log message based upon that
discussion if we move forward with this change at all.

Thank you.


Re: WARNING in up_write

2018-04-05 Thread Dave Chinner
On Thu, Apr 05, 2018 at 05:13:25PM -0700, Eric Biggers wrote:
> On Fri, Apr 06, 2018 at 08:32:26AM +1000, Dave Chinner wrote:
> > On Wed, Apr 04, 2018 at 08:24:54PM -0700, Matthew Wilcox wrote:
> > > On Wed, Apr 04, 2018 at 11:22:00PM -0400, Theodore Y. Ts'o wrote:
> > > > On Wed, Apr 04, 2018 at 12:35:04PM -0700, Matthew Wilcox wrote:
> > > > > On Wed, Apr 04, 2018 at 09:24:05PM +0200, Dmitry Vyukov wrote:
> > > > > > On Tue, Apr 3, 2018 at 4:01 AM, syzbot
> > > > > >  wrote:
> > > > > > > DEBUG_LOCKS_WARN_ON(sem->owner != get_current())
> > > > > > > WARNING: CPU: 1 PID: 4441 at kernel/locking/rwsem.c:133 
> > > > > > > up_write+0x1cc/0x210
> > > > > > > kernel/locking/rwsem.c:133
> > > > > > > Kernel panic - not syncing: panic_on_warn set ...
> > > > > 
> > > > > Message-Id: <1522852646-2196-1-git-send-email-long...@redhat.com>
> > > > >
> > > > 
> > > > We were way ahead of syzbot in this case.  :-)
> > > 
> > > Not really ... syzbot caught it Monday evening ;-)
> > 
> > Rather than arguing over who reported it first, I think that time
> > would be better spent reflecting on why the syzbot report was
> > completely ignored until *after* Ted diagnosed the issue
> > independently and Waiman had already fixed it
> > 
> > Clearly there is scope for improvement here.
> > 
> > Cheers,
> > 
> 
> Well, ultimately a human needed to investigate the syzbot bug report to figure
> out what was really going on.  In my view, the largest problem is that there 
> are
> simply too many bugs, so many are getting ignored.

Well, yeah. And when there's too many bugs, looking at the ones
people are actually hitting tend to take precedence over those
reported by a bot an image problem...

> If there were only a few bugs, then Dmitry would investigate each
> one and send a "real" bug report of better quality than the
> automated system can provide, or even send a fix directly.  But in
> reality, on the same day this bug was reported, syzbot also found
> 10 other bugs, and in the previous 2 days it had found 38 more.
> No single person can keep up with that. 

And this is precisely why people turn around and ask the syzbot
developers to do things that make it easier for them to diagnose
the problems syzbot reports.

> You can see the current
> bug list, which has 172 open bugs, on the dashboard at
> https://syzkaller.appspot.com/. 

Is that all? That's *nothing*.

> Yes, the kernel really is that
> broken.

Actually, that tells me the kernel is a hell of a lot better than my
experience leads me to believe it is. I'd have expected thousands of
bugs, even tens of thousands of bugs given how many issues we deal
with in individual subsystems on a day to day basis.

> And although quite a few of these bugs will end up to be
> duplicates or even already fixed, a human still has to look at
> each one to figure that out.  (Though, I do think that syzbot
> should try to automatically detect when a reproducible bug was
> already fixed, via bisection.  It would cause a few bugs to be
> incorrectly considered fixed, but it may be a worthwhile
> tradeoff.)
> 
> These bugs are all over the kernel as well, so most developers
> don't see the big picture but rather just see a few bugs for
> "their" subsystem on "their" subsystem's mailing list and
> sometimes demand special attention.  Of course, it's great when
> people suggest ways to improve the process.

That's not the response I got

> But it's not great
> when people just don't feel responsible for fixing bugs and wait
> for Someone Else to do it.

The excessive cross posting of the reports is one of the reasons
people think someone else will take care of it. i.e. "Oh, that looks VFS,
that went to -fsdevel, I don't need to look at it"

Put simply: if you're mounting an XFS filesystem image and something
goes bang, then it should be reported to the XFS list. It does not
need to be cross posted to LKML, -fsdevel, 10 individual developers,
etc. If it's not an XFS problem, then the XFS developers will CC the
relevant lists as needed.

> I'm hoping that in the future the syzbot "team", which seems to
> actually be just Dmitry now, can get more resources towards
> helping fix the bugs.  But either way, in the end Linux is a
> community effort.

We don't really need help fixing the bugs - we need help making it
easier to *find the bug* the bot tripped over. That's what the
syzbot team needs to focus on, not tell people that what they got is
all they are going to get.

> Note also that syzbot wasn't super useful in this particular case
> because people running xfstests came across the same bug.  But,
> this is actually a rare case.  Most syzbot bug reports have been
> for weird corner cases or races that no one ever thought of
> before, so there are no existing tests that find them.

Which is exactly what these whacky "mount a filesystem fragment"
tests it is now doing are exercising.  Finding the cause of
corruption related crashes is not easy and takes time. Having the
bot 

  1   2   3   4   5   6   7   8   9   10   >