Re: [PATCH V7 3/3] arm64/mm: Enable memory hot remove

2019-09-12 Thread Anshuman Khandual
On 09/13/2019 01:45 AM, Catalin Marinas wrote:
> Hi Anshuman,
> 
> Thanks for the details on the need for removing the page tables and
> vmemmap backing. Some comments on the code below.
> 
> On Tue, Sep 03, 2019 at 03:15:58PM +0530, Anshuman Khandual wrote:
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -60,6 +60,14 @@ static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss 
>> __maybe_unused;
>>  
>>  static DEFINE_SPINLOCK(swapper_pgdir_lock);
>>  
>> +/*
>> + * This represents if vmalloc and vmemmap address range overlap with
>> + * each other on an intermediate level kernel page table entry which
>> + * in turn helps in deciding whether empty kernel page table pages
>> + * if any can be freed during memory hotplug operation.
>> + */
>> +static bool vmalloc_vmemmap_overlap;
> 
> I'd say just move the static find_vmalloc_vmemmap_overlap() function
> here, the compiler should be sufficiently smart to figure out
> that it's just a build-time constant.

Sure, will do.

> 
>> @@ -770,6 +1022,28 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node,
>>  void vmemmap_free(unsigned long start, unsigned long end,
>>  struct vmem_altmap *altmap)
>>  {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +/*
>> + * FIXME: We should have called remove_pagetable(start, end, true).
>> + * vmemmap and vmalloc virtual range might share intermediate kernel
>> + * page table entries. Removing vmemmap range page table pages here
>> + * can potentially conflict with a concurrent vmalloc() allocation.
>> + *
>> + * This is primarily because vmalloc() does not take the init_mm ptl
>> + * for the entire page table walk and its modification. Instead it
>> + * just takes the lock while allocating and installing page table
>> + * pages via [p4d|pud|pmd|pte]_alloc(). A concurrently vanishing page
>> + * table entry via memory hot remove can invalidate vmalloc()'s
>> + * kernel page table walk pointers on the fly, which can cause
>> + * corruption or, worse, a crash.
>> + *
>> + * So free_empty_tables() gets called where vmalloc and vmemmap range
>> + * do not overlap at any intermediate level kernel page table entry.
>> + */
>> +unmap_hotplug_range(start, end, true);
>> +if (!vmalloc_vmemmap_overlap)
>> +free_empty_tables(start, end);
>> +#endif
>>  }
> 
> So, I see the risk with overlapping and I guess for some kernel
> configurations (PAGE_SIZE == 64K) we may not be able to avoid it. If we

I did not see the 64K config options having this overlap; do you suspect they
might? After the 52-bit KVA series was merged, the following configurations
have the vmalloc-vmemmap range overlap problem.

- 4K  page size with 48 bit VA space
- 16K page size with 48 bit VA space

> can, that's great, otherwise could we rewrite the above functions to
> handle floor and ceiling similar to free_pgd_range()? (I wonder how this
> function works if you called it on init_mm and kernel address range). By

Hmm, never tried that. Are you wondering if this can be used directly?
There are two distinct elements which make it very specific to user page
tables: mmu_gather-based TLB tracking and mm->pgtable_bytes accounting
via mm_dec_nr_pxx().

> having the vmemmap start/end information it avoids freeing partially
> filled page table pages.

Did you mean page table pages which can partially overlap with vmalloc?

The problem (race) is not an inability to deal with a partially filled
table; we can handle that correctly, as explained below [1]. The problem
is inadequate kernel page table locking during vmalloc(), which might be
accessing intermediate kernel page table pointers that free_empty_tables()
is concurrently freeing. Hence we cannot free any page table page which
could ever hold entries from the vmalloc() range.

Though I am not completely sure I have understood the suggestion about a
floor/ceiling mechanism as in free_pgd_range(): are you suggesting that we
should only attempt to free those vmemmap-range page table pages which
could *definitely* never overlap with vmalloc, by working on a modified
vmemmap range (i.e. cut down with floor/ceiling at each level to avoid the
vmalloc range)? That would be a restrictive version of free_empty_tables()
called when there is an overlap, so we would maintain two versions of
free_empty_tables(). Please correct me if any of the above assumptions or
understanding is wrong.

But yes, with this we should be able to free up some possibly empty page
table pages which are left out in the current proposal when an overlap
happens.
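As an editor's aside, the overlap condition this thread keeps returning to can be modelled in plain C: two virtual ranges share an intermediate-level table entry exactly when their index intervals at that level intersect. The shift and the addresses below are hypothetical illustrations, not the arm64 kernel's actual layout, and a real check would cover every intermediate level, not just the top one.

```c
#include <stdint.h>

/* Sketch: do two virtual ranges share a top-level (PGD) entry?
 * PGDIR_SHIFT = 39 corresponds to a hypothetical 4K/48-bit layout. */
#define PGDIR_SHIFT 39

static int ranges_share_pgd(uint64_t a_start, uint64_t a_end,
			    uint64_t b_start, uint64_t b_end)
{
	/* The ranges use a common PGD entry iff their index intervals
	 * [start >> SHIFT, (end - 1) >> SHIFT] overlap. */
	return (a_start >> PGDIR_SHIFT) <= ((b_end - 1) >> PGDIR_SHIFT) &&
	       (b_start >> PGDIR_SHIFT) <= ((a_end - 1) >> PGDIR_SHIFT);
}
```

When this predicate is true for the vmalloc and vmemmap ranges at any level, freeing "empty" vmemmap tables could pull a live table out from under a concurrent vmalloc() walk, which is exactly the race described above.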

[1] Skipping partially filled page tables

All free_pXX_table() functions take care to avoid freeing partially filled
page table pages, whether they represent (or ever represented) linear,
vmemmap or vmalloc mappings in init_mm. They go over each individual entry
in a given page table, making 

RE: [EXT] Re: [PATCH 2/3] ASoC: fsl_asrc: update supported sample format

2019-09-12 Thread S.j. Wang
Hi

> 
> On Tue, Sep 10, 2019 at 02:07:25AM +, S.j. Wang wrote:
> > > On Mon, Sep 09, 2019 at 06:33:20PM -0400, Shengjiu Wang wrote:
> > > > The ASRC supports 24-bit/16-bit/8-bit input widths, so the S20_3LE
> > > > format should not be supported; its word width is 20 bits.
> > >
> > > I thought 3LE used 24-bit physical width. And the driver assigns
> > > ASRC_WIDTH_24_BIT to "width" for all non-16bit cases, so 20-bit
> > > would go for that 24-bit slot also. I don't clearly recall if I had
> > > explicitly tested S20_3LE, but I feel it should work since I put there...
> >
> > For S20_3LE the width is 20 bits, but the ASRC only supports 24 bits.
> > If ASRMCR1n.IWD is set to 24-bit while the actual width is 20 bits,
> > the volume is lower than expected; it is like 24-bit data shifted
> > right by 4 bits. So it is not supported.
> 
> Hmm..S20_3LE right-aligns 20 bits in a 24-bit slot? I thought they're left
> aligned...
> 
> If this is the case...shouldn't we have the same lower-volume problem for
> all hardwares that support S20_3LE now?

Actually, some hardware/modules can select the start bit (for example
the 20th bit) when transferring data from the FIFO to the shift
register, but not all modules have this capability.

The ASRC does not. IWD can only describe the data width; there is no
other bit for the slot width.

Best regards
Wang shengjiu
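As an editor's illustration of the effect described above (assumed behaviour, not driver code): with ASRMCR1n.IWD set to 24-bit and a 20-bit sample sitting right-justified in the slot, the hardware consumes the data 4 bits low, i.e. 1/16th of the expected amplitude, roughly 24 dB quieter.

```c
#include <stdint.h>

/* Hypothetical model: 24-bit-only hardware takes the slot at face
 * value, so a right-justified 20-bit sample is interpreted as if the
 * correctly left-justified sample had been shifted right by 4 bits. */
static int32_t consume_as_24bit(int32_t slot24)
{
	return slot24;		/* hardware sees the raw slot contents */
}

static int32_t left_justify_20_in_24(int32_t sample20)
{
	return sample20 << 4;	/* what a slot-aware path would feed it */
}
```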





Re: [PATCH v3 5/6] x86: alternative.h: use asm_inline for all alternative variants

2019-09-12 Thread Ingo Molnar


* Rasmus Villemoes  wrote:

> Most, if not all, uses of the alternative* family just provide one or
> two instructions in .text, but the string literal can be quite large,
> causing gcc to overestimate the size of the generated code. That in
> turn affects its decisions about inlining of the function containing
> the alternative() asm statement.
> 
> New enough versions of gcc allow one to overrule the estimated size by
> using "asm inline" instead of just "asm". So replace asm by the helper
> asm_inline, which for older gccs just expands to asm.
> 
> Signed-off-by: Rasmus Villemoes 

Acked-by: Ingo Molnar 

Thanks,

Ingo
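For readers without the patch in front of them, a simplified userspace rendition of the helper being acked follows. The kernel's real definition gates on detected compiler support rather than the raw version check used here, so treat this as a sketch only.

```c
/* gcc >= 9 understands the "inline" asm qualifier, which tells the
 * compiler to treat the asm statement as minimum size for its inlining
 * heuristics; older compilers (and clang, which reports __GNUC__ == 4)
 * fall back to a plain asm. */
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 9
# define asm_inline __asm__ __inline__
#else
# define asm_inline __asm__
#endif

static int poke(int x)
{
	/* However large the string literal, this emits (at most) one
	 * instruction of .text -- here, none at all. */
	asm_inline("" : "+r"(x));
	return x + 1;
}
```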


Re: [PATCH v3 6/6] x86: bug.h: use asm_inline in _BUG_FLAGS definitions

2019-09-12 Thread Ingo Molnar


* Rasmus Villemoes  wrote:

> This helps prevent a BUG* or WARN* in some static inline from
> preventing that inline (or one of its callers) from being inlined, so
> it should allow gcc to make better informed inlining decisions.
> 
> For example, with gcc 9.2, tcp_fastopen_no_cookie() vanishes from
> net/ipv4/tcp_fastopen.o. It does not itself have any BUG or WARN, but
> it calls dst_metric() which has a WARN_ON_ONCE - and despite that
> WARN_ON_ONCE vanishing since the condition is compile-time false,
> dst_metric() is apparently sufficiently "large" that when it gets
> inlined into tcp_fastopen_no_cookie(), the latter becomes too large
> for inlining.
> 
> Overall, if one asks size(1), .text decreases a little and .data
> increases by about the same amount (x86-64 defconfig)
> 
> $ size vmlinux.{before,after}
>     text    data     bss      dec     hex filename
> 19709726 5202600 1630280 26542606 195020e vmlinux.before
> 19709330 5203068 1630280 26542678 1950256 vmlinux.after
> 
> while bloat-o-meter says
> 
> add/remove: 10/28 grow/shrink: 103/51 up/down: 3669/-2854 (815)
> ...
> Total: Before=14783683, After=14784498, chg +0.01%
> 
> Signed-off-by: Rasmus Villemoes 

Acked-by: Ingo Molnar 

Thanks,

Ingo


[PATCH] tty: 8250_of: Use software emulated RS485 direction control

2019-09-12 Thread Heiko Schocher
Use software-emulated RS485 direction control to provide the RS485 API.

Currently it is not possible to use RS485, because the rs485_config
pointer in struct uart_port is NULL when we configure the port through
the device tree.
Signed-off-by: Heiko Schocher 

---
Patch is based on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
commit:
505a8ec7e11a - Revert "drm/i915/userptr: Acquire the page lock around 
set_page_dirty()"

checkpatch output:
$ ./scripts/checkpatch.pl 
0001-tty-8250_of-Use-software-emulated-RS485-direction-co.patch
total: 0 errors, 0 warnings, 43 lines checked

0001-tty-8250_of-Use-software-emulated-RS485-direction-co.patch has no obvious 
style problems and is ready for submission.

 drivers/tty/serial/8250/8250_of.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_of.c 
b/drivers/tty/serial/8250/8250_of.c
index 0826cfdbd4063..92fbf46ce3bd9 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c
@@ -48,6 +48,36 @@ static inline void tegra_serial_handle_break(struct 
uart_port *port)
 }
 #endif
 
+static int of_8250_rs485_config(struct uart_port *port,
+ struct serial_rs485 *rs485)
+{
+   struct uart_8250_port *up = up_to_u8250p(port);
+
+   /* Clamp the delays to [0, 100ms] */
+   rs485->delay_rts_before_send = min(rs485->delay_rts_before_send, 100U);
+   rs485->delay_rts_after_send  = min(rs485->delay_rts_after_send, 100U);
+
+   port->rs485 = *rs485;
+
+   /*
+* Both serial8250_em485_init and serial8250_em485_destroy
+* are idempotent
+*/
+   if (rs485->flags & SER_RS485_ENABLED) {
+   int ret = serial8250_em485_init(up);
+
+   if (ret) {
+   rs485->flags &= ~SER_RS485_ENABLED;
+   port->rs485.flags &= ~SER_RS485_ENABLED;
+   }
+   return ret;
+   }
+
+   serial8250_em485_destroy(up);
+
+   return 0;
+}
+
 /*
  * Fill a struct uart_port for a given device node
  */
@@ -178,6 +208,7 @@ static int of_platform_serial_setup(struct platform_device 
*ofdev,
port->flags |= UPF_SKIP_TEST;
 
	port->dev = &ofdev->dev;
+   port->rs485_config = of_8250_rs485_config;
 
switch (type) {
case PORT_TEGRA:
-- 
2.21.0
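For context, a port using this rs485_config callback would typically be described with the generic RS485 device-tree properties; the node below is hypothetical, and the two cells of rs485-rts-delay are the before/after-send delays in milliseconds that the code above clamps to 100 ms.

```dts
serial@21c0500 {
	compatible = "ns16550a";	/* hypothetical 8250-compatible node */
	reg = <0x0 0x21c0500 0x0 0x100>;
	linux,rs485-enabled-at-boot-time;
	rs485-rts-delay = <10 10>;	/* RTS delay before/after send, ms */
};
```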



Re: [PATCH] leds: remove PAGE_SIZE limit of /sys/class/leds//trigger

2019-09-12 Thread Greg Kroah-Hartman
On Fri, Sep 13, 2019 at 09:34:49AM +0900, Akinobu Mita wrote:
> On Fri, Sep 13, 2019 at 2:15, Jacek Anaszewski wrote:
> >
> > Hi Akinobu,
> >
> > Please bump patch version each time you send an update
> > of the patch with the same subject.
> 
> Oops, should I resend with the correct subject?

Yes please.


Re: [Ksummit-discuss] [PATCH v2 3/3] libnvdimm, MAINTAINERS: Maintainer Entry Profile

2019-09-12 Thread Greg KH
On Fri, Sep 13, 2019 at 07:41:55AM +0530, Aneesh Kumar K.V wrote:
> On 9/12/19 12:13 AM, Dan Carpenter wrote:
> > On Wed, Sep 11, 2019 at 08:48:59AM -0700, Dan Williams wrote:
> > > +Coding Style Addendum
> > > +-
> > > +libnvdimm expects multi-line statements to be double indented. I.e.
> > > +
> > > +if (x...
> > > +&& ...y) {
> > 
> > That looks horrible and it causes a checkpatch warning.  :(  Why not
> > do it the same way that everyone else does it.
> > 
> > if (blah_blah_x && <-- && has to be on the first line for checkpatch
> > blah_blah_y) { <-- [tab][space][space][space][space]blah
> > 
> > Now all the conditions are aligned visually which makes it readable.
> > They aren't aligned with the indent block so it's easy to tell the
> > inside from the if condition.
> 
> 
> I came across this while sending patches to the libnvdimm subsystem.
> With respect to coding style, can we have a consistent style across the
> kernel? Otherwise one would have to change editor settings when working
> across different subsystems. In this specific case, both the clang-format
> and Emacs customization tips in the kernel documentation directory
> suggest the latter style.

We _should_ have a consistent coding style across the whole kernel,
that's the whole reason for having a coding style in the first place!

The problem is, we all have agreed on the "basics" a long time ago, but
are now down in the tiny nits as to what some minor things should, or
should not, look like.

It might be time to just bite the bullet and adopt something like
clang-format to stop arguing about stuff like this for new submissions,
if for no other reason than to keep us from wasting mental energy on
trivial things.

thanks,

greg k-h
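The kernel tree does in fact carry a top-level .clang-format; a minimal fragment in that spirit, with illustrative values rather than a verbatim copy of the in-tree file, looks like:

```yaml
# Kernel-style clang-format settings (illustrative subset)
IndentWidth: 8
UseTab: Always
ColumnLimit: 80
BreakBeforeBraces: Linux
AllowShortIfStatementsOnASingleLine: false
ContinuationIndentWidth: 8
```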


Re: KASAN: slab-out-of-bounds Read in handle_vmptrld

2019-09-12 Thread Greg Kroah-Hartman
On Thu, Sep 12, 2019 at 06:49:26PM +0200, Paolo Bonzini wrote:
> [tl;dr: there could be a /dev/usb bug only affecting KASAN
> configurations, jump to the end to skip the analysis and get to the bug
> details]
> 
> On 12/09/19 15:54, Vitaly Kuznetsov wrote:
> > Hm, the bisection seems bogus but the stack points us to the following
> > piece of code:
> > 
> >  4776)  if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
> > 
> >  4783)  return nested_vmx_failValid(vcpu,
> >  4784)  
> > VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
> >  4785)  }
> >  4786) 
> >  4787)  new_vmcs12 = map.hva;
> >  4788) 
> > *4789)  if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
> >  4790)  (new_vmcs12->hdr.shadow_vmcs &&
> >  4791)   !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
> > 
> > the reported problem seems to be on VMCS12 region access, but it's part
> > of guest memory and we successfully managed to map it. We're definitely
> > within 1-page range. Maybe KASAN is just wrong here?
> 
> Here is the relevant part of the syzkaller repro:
> 
> syz_kvm_setup_cpu$x86(r1, 0x,
> &(0x7f00/0x18000)=nil, 0x0, 0x133, 0x0, 0x0, 0xff7d)
> r3 = syz_open_dev$usb(&(0x7f80)='/dev/bus/usb/00#/00#\x00',
> 0x4fd, 0x2008042)
> mmap$IORING_OFF_SQES(&(0x7f007000/0x2000)=nil, 0x2000, 0x4, 0x13,
> r3, 0x1000)
> syz_kvm_setup_cpu$x86(0x, r2,
> &(0x7f00/0x18000)=nil, 0x0, 0xfefd, 0x40, 0x0, 0xfdd4)
> ioctl$KVM_RUN(r2, 0xae80, 0x0)
> 
> The mmap$IORING_OFF_SQES is just a normal mmap from a device, which
> replaces the previous mapping for guest memory and in particular
> 0x7f007000 which is the VMCS (from the C reproducer: "#define
> ADDR_VAR_VMCS 0x7000").
> 
> The previous mapping is freed with do_munmap and then repopulated in
> usbdev_mmap with remap_pfn_range.  In KVM this means that kvm_vcpu_map
> goes through hva_to_pfn_remapped, which correctly calls get_page via
> kvm_get_pfn.  (Note that although drivers/usb/core/devio.c's usbdev_mmap
> sets VM_IO *after* calling remap_pfn_range, remap_pfn_range itself
> helpfully sets it before calling remap_p4d_range.  And anyway KVM is
> looking at vma->vm_flags under mmap_sem, which is held during mmap).
> 
> So, KVM should be doing the right thing.  Now, the error is:
> 
> > Read of size 4 at addr 888091e1 by task syz-executor758/10006
> > The buggy address belongs to the object at 888091e109c0 
> > The buggy address is located 2496 bytes to the left of
> >  8192-byte region [888091e109c0, 888091e129c0) 
> 
> And given the use of remap_pfn_range in devusb_mmap, the simplest
> explanation could be that USB expects kmalloc-8k to return 8k-aligned
> values, but this is not true anymore with KASAN.  CCing Dmitry, Greg and
> linux-usb.

USB drivers expect kmalloc to return DMA-able memory.  I don't know
about specific alignment issues; that should only be an issue for the
host controller being used here, which you do not name above.

We have had some reports that usbdev_mmap() does not do the "correct
thing" for all host controllers, but a lot of the DMA work that is in
linux-next for 5.4-rc1 should have helped resolve those issues.  What
tree are you seeing these bug reports happening from?

thanks,

greg k-h
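The "2496 bytes to the left" in the report is consistent with Paolo's alignment theory. As an editor's illustration, using only the low bits of the (truncated) addresses from the report: the kmalloc-8k object starts at ...e109c0, and rounding an address in it down to a page boundary lands 0x9c0 == 2496 bytes before the object, outside the KASAN-tracked region.

```c
#include <stdint.h>

/* A caller that assumes kmalloc buffers are page (or 8k) aligned and
 * rounds an address down accordingly will, on a KASAN kernel (whose
 * redzones shift object starts), land outside the object. */
static uint64_t align_down(uint64_t addr, uint64_t align)
{
	return addr & ~(align - 1);
}
```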


Re: [RFC V1 1/7] genirq/msi: Differentiate between various MSI based interrupts

2019-09-12 Thread Greg KH
On Thu, Sep 12, 2019 at 06:32:02PM -0700, Megha Dey wrote:
> +enum msi_desc_tags {
> + IRQ_MSI_TAG_MSI,
> + IRQ_MSI_TAG_MSIX,
> + IRQ_MSI_TAG_IMS,
> + IRQ_MSI_TAG_PLAT,
> + IRQ_MSI_TAG_FSL,
> + IRQ_MSI_TAG_SCI,
> +};

What do any of these mean?  Can you please provide comments, at the
very least saying what FSL, SCI, IMS and everything else stand for?

thanks,

greg k-h
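A commented rendition of the enum along the lines Greg asks for; note the expansions marked "guess" below are the editor's inferences from context and would need confirming against the series itself.

```c
/* Tags distinguishing the flavour of MSI descriptor. */
enum msi_desc_tags {
	IRQ_MSI_TAG_MSI,	/* classic PCI MSI */
	IRQ_MSI_TAG_MSIX,	/* PCI MSI-X */
	IRQ_MSI_TAG_IMS,	/* device-managed Interrupt Message Store */
	IRQ_MSI_TAG_PLAT,	/* platform-MSI, non-PCI (guess) */
	IRQ_MSI_TAG_FSL,	/* Freescale MSI controller variant (guess) */
	IRQ_MSI_TAG_SCI,	/* meaning unclear from the patch (guess) */
};
```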


Re: [RFC V1 2/7] drivers/base: Introduce callbacks for IMS interrupt domain

2019-09-12 Thread Greg KH
On Thu, Sep 12, 2019 at 06:32:03PM -0700, Megha Dey wrote:
> This patch serves as a preparatory patch to introduce a new IMS
> (Interrupt Message Store) domain. It consists of APIs which would
> be used as callbacks to the IRQ chip associated with the IMS domain.
> 
> The APIs introduced in this patch are:
> dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
> dev_ims_unmask_irq - Generic irq chip callback to unmask IMS interrupts
> dev_ims_domain_write_msg - Helper to write MSI message to Device IMS
> 
> It also introduces IMS specific structures namely:
> dev_ims_ops - Callbacks for IMS domain ops
> dev_ims_desc - Device specific IMS msi descriptor data
> dev_ims_priv_data - Internal data structure containing a unique devid
> and a pointer to the IMS domain ops
> 
> Lastly, it adds a new config option MSI_IMS which must be enabled by
> any driver who would want to use the IMS infrastructure.
> 
> Since IMS is not PCI compliant (like platform-msi), most of the code is
> similar to platform-msi.c.
> 
> TODO: Conclude if ims-msi.c and platform-msi.c can be merged.
> 
> Cc: Jacob Pan 
> Signed-off-by: Sanjay Kumar 
> Signed-off-by: Megha Dey 
> ---
>  drivers/base/Kconfig   |  7 
>  drivers/base/Makefile  |  1 +
>  drivers/base/ims-msi.c | 94 
> ++
>  include/linux/msi.h| 35 ++-
>  4 files changed, 136 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/base/ims-msi.c
> 
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index dc40449..038fabd 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -206,3 +206,10 @@ config GENERIC_ARCH_TOPOLOGY
> runtime.
>  
>  endmenu
> +
> +config MSI_IMS
> + bool "Device Specific Interrupt Message Storage (IMS)"
> + select GENERIC_MSI_IRQ
> + help
> +   This allows device drivers to enable device specific
> +   interrupt message storage (IMS) besides standard MSI-X interrupts.

This text tells me nothing about whether I want to enable this or not.
How is a user (or even a developer) supposed to know if their hardware
requires this?

And I _really_ don't want to see this in drivers/base/ if at all possible,
because suddenly I am responsible for code that I know nothing about.

greg k-h


[PATCH] iwlwifi: dbg_ini: fix memory leak in alloc_sgtable

2019-09-12 Thread Navid Emamdoost
In alloc_sgtable(), if alloc_page() fails, the allocated table should be
released.

Signed-off-by: Navid Emamdoost 
---
 drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c 
b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
index 4d81776f576d..db41abb3361d 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
@@ -643,6 +643,7 @@ static struct scatterlist *alloc_sgtable(int size)
if (new_page)
__free_page(new_page);
}
+   kfree(table);
return NULL;
}
alloc_size = min_t(int, size, PAGE_SIZE);
-- 
2.17.1
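The shape of the bug and its fix, modelled in self-contained userspace C; names and counters are illustrative, not the driver's.

```c
#include <stdlib.h>

/* Model of alloc_sgtable(): allocate a container, then fill it with
 * page allocations. On a mid-loop failure, unwind the pages already
 * taken AND free the container itself -- the kfree(table) the patch
 * adds is the second half of that unwind. */
static int pages_alive, tables_alive;

static void *alloc_table_with_pages(int n, int fail_at)
{
	void **table = malloc(n * sizeof(*table));
	int i;

	if (!table)
		return NULL;
	tables_alive++;

	for (i = 0; i < n; i++) {
		table[i] = (i == fail_at) ? NULL : malloc(16);
		if (!table[i]) {
			while (i--) {	/* free pages already allocated */
				free(table[i]);
				pages_alive--;
			}
			free(table);	/* the previously missing kfree() */
			tables_alive--;
			return NULL;
		}
		pages_alive++;
	}
	return table;
}
```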



Re: [PATCH v8 00/17] Enable FSGSBASE instructions

2019-09-12 Thread Andy Lutomirski

On 9/12/19 1:06 PM, Chang S. Bae wrote:


Updates from v7 [7]:
(1) Consider FSGSBASE when determining which Spectre SWAPGS mitigations are
 required.
(2) Fixed save_fsgs() to be aware of interrupt conditions
(3) Made selftest changes based on Andy's previous fixes and cleanups
(4) Included Andy's paranoid exit cleanup
(5) Included documentation rewritten by Thomas
(6) Carried on Thomas' edits on multiple changelogs and comments
(7) Used '[FS|GS] base' consistently, except for selftest where GSBASE has
 been already used in its test messages
(8) Dropped the READ_MSR_GSBASE macro



This looks unpleasant to review.  I wonder if it would be better to 
unrevert the reversion, merge up to Linus' tree or -tip, and then base 
the changes on top of that.


I also think that, before this series can have my ack, it needs an 
actual gdb maintainer to chime in, publicly, and state that they have 
thought about and tested the ABI changes and that gdb still works on 
patched kernels with and without FSGSBASE enabled.  I realize that there 
were all kinds of discussions, but they were all quite theoretical, and 
I think that the actual patches need to be considered by people who 
understand the concerns.  Specific test cases would be nice, too.


Finally, I wrote up some notes here:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes=70a7d284989e3539ee84f9d709d6450099f773fb

I want to make sure that they're accounted for, and that patch should 
possibly be applied.  The parent (broken link, but should fix itself soon):


https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes=166324e907f8a71c823b41bbc2e1b5bc711532d8

may also help understand the relevant code.

--Andy


Re: [PATCH] clk: imx: lpcg: write twice when writing lpcg regs

2019-09-12 Thread Shawn Guo
On Tue, Sep 10, 2019 at 02:47:59AM +, Anson Huang wrote:
> 
> 
> > On Sat, Sep 7, 2019 at 9:47 PM Stephen Boyd  wrote:
> > >
> > > Quoting Peng Fan (2019-08-27 01:17:50)
> > > > From: Peng Fan 
> > > >
> > > > There is a hardware issue:
> > > > the output clock of the LPCG cell will not turn back on as expected,
> > > > even though a read of the IPG registers in the LPCG indicates that
> > > > the clock should be enabled.
> > > >
> > > > The software workaround is to write twice to enable the LPCG clock
> > > > output.
> > > >
> > > > Signed-off-by: Peng Fan 
> > >
> > > Does this need a Fixes tag?
> > 
> > Not sure, as it's not a code logic issue but a hardware bug.
> > And 4.19 LTS does not have this driver yet.
> 
> Looks like there is an erratum for this issue, and Ranjani just sent a
> patch for review internally.

Having the erratum number in both the commit log and a code comment is
generally helpful.

Shawn
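The workaround under discussion amounts to issuing each LPCG register write twice. A model with a write log standing in for real MMIO stores (illustrative only, not the clk-imx code):

```c
#include <stdint.h>

/* Log of register writes, standing in for MMIO stores. */
static uint32_t write_log[8];
static int nwrites;

static void reg_write(uint32_t val)
{
	write_log[nwrites++] = val;
}

/* Erratum workaround: the first write may not latch in the LPCG cell,
 * so the enable value is written twice. */
static void lpcg_write(uint32_t val)
{
	reg_write(val);
	reg_write(val);
}
```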


[PATCH net-next] net: dsa: b53: Add support for port_egress_floods callback

2019-09-12 Thread Florian Fainelli
Add support for configuring the per-port egress flooding control for
both Unicast and Multicast traffic.

Signed-off-by: Florian Fainelli 
---
Benedikt,

Do you mind re-testing, or confirming that this patch that I sent much
earlier does work correctly for you? Thanks!

 drivers/net/dsa/b53/b53_common.c | 33 
 drivers/net/dsa/b53/b53_priv.h   |  2 ++
 2 files changed, 35 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 7d328a5f0161..ac2ec08a652b 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -342,6 +342,13 @@ static void b53_set_forwarding(struct b53_device *dev, int 
enable)
	b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, &mgmt);
mgmt |= B53_MII_DUMB_FWDG_EN;
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
+
+   /* Look at B53_UC_FWD_EN and B53_MC_FWD_EN to decide whether
+* frames should be flooded or not.
+*/
+   b53_read8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, &mgmt);
+   mgmt |= B53_UC_FWD_EN | B53_MC_FWD_EN;
+   b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt);
 }
 
 static void b53_enable_vlan(struct b53_device *dev, bool enable,
@@ -1753,6 +1760,31 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_br_fast_age);
 
+int b53_br_egress_floods(struct dsa_switch *ds, int port,
+bool unicast, bool multicast)
+{
+   struct b53_device *dev = ds->priv;
+   u16 uc, mc;
+
+   b53_read16(dev, B53_CTRL_PAGE, B53_UC_FWD_EN, &uc);
+   if (unicast)
+   uc |= BIT(port);
+   else
+   uc &= ~BIT(port);
+   b53_write16(dev, B53_CTRL_PAGE, B53_UC_FWD_EN, uc);
+
+   b53_read16(dev, B53_CTRL_PAGE, B53_MC_FWD_EN, &mc);
+   if (multicast)
+   mc |= BIT(port);
+   else
+   mc &= ~BIT(port);
+   b53_write16(dev, B53_CTRL_PAGE, B53_MC_FWD_EN, mc);
+
+   return 0;
+
+}
+EXPORT_SYMBOL(b53_br_egress_floods);
+
 static bool b53_possible_cpu_port(struct dsa_switch *ds, int port)
 {
/* Broadcom switches will accept enabling Broadcom tags on the
@@ -1953,6 +1985,7 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_bridge_leave  = b53_br_leave,
.port_stp_state_set = b53_br_set_stp_state,
.port_fast_age  = b53_br_fast_age,
+   .port_egress_floods = b53_br_egress_floods,
.port_vlan_filtering= b53_vlan_filtering,
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index f25bc80c4ffc..a7dd8acc281b 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -319,6 +319,8 @@ int b53_br_join(struct dsa_switch *ds, int port, struct 
net_device *bridge);
 void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
 void b53_br_fast_age(struct dsa_switch *ds, int port);
+int b53_br_egress_floods(struct dsa_switch *ds, int port,
+bool unicast, bool multicast);
 void b53_port_event(struct dsa_switch *ds, int port);
 void b53_phylink_validate(struct dsa_switch *ds, int port,
  unsigned long *supported,
-- 
2.17.1
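The register manipulation in b53_br_egress_floods() boils down to a per-port read-modify-write of two 16-bit masks. A self-contained model, with the switch registers replaced by plain variables:

```c
#include <stdint.h>

#define BIT(n) (1U << (n))

/* Each port owns one bit in the unicast/multicast forward-enable
 * masks; flooding is enabled by setting the port's bit and disabled
 * by clearing it. */
static uint16_t uc_fwd_en, mc_fwd_en;

static void set_egress_floods(int port, int unicast, int multicast)
{
	if (unicast)
		uc_fwd_en |= BIT(port);
	else
		uc_fwd_en &= ~BIT(port);

	if (multicast)
		mc_fwd_en |= BIT(port);
	else
		mc_fwd_en &= ~BIT(port);
}
```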



[PATCH v2 2/2] gpiolib: introduce fwnode_gpiod_get_index()

2019-09-12 Thread Dmitry Torokhov
This introduces fwnode_gpiod_get_index(), which iterates through the
common GPIO suffixes when trying to locate a GPIO within a given firmware
node.

We also switch devm_fwnode_gpiod_get_index() to call
fwnode_gpiod_get_index() instead of iterating through GPIO suffixes on
its own.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Dmitry Torokhov 

---

Changes in v2:
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

 drivers/gpio/gpiolib-devres.c | 16 +---
 drivers/gpio/gpiolib.c| 48 +++
 include/linux/gpio/consumer.h | 13 ++
 3 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 9a0475c87f95..4421be09b960 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -205,29 +205,15 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct 
device *dev,
  enum gpiod_flags flags,
  const char *label)
 {
-   char prop_name[32]; /* 32 is max size of property name */
struct gpio_desc **dr;
struct gpio_desc *desc;
-   unsigned int i;
 
dr = devres_alloc(devm_gpiod_release, sizeof(struct gpio_desc *),
  GFP_KERNEL);
if (!dr)
return ERR_PTR(-ENOMEM);
 
-   for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) {
-   if (con_id)
-   snprintf(prop_name, sizeof(prop_name), "%s-%s",
-   con_id, gpio_suffixes[i]);
-   else
-   snprintf(prop_name, sizeof(prop_name), "%s",
-   gpio_suffixes[i]);
-
-   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
- label);
-   if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
-   break;
-   }
+   desc = fwnode_gpiod_get_index(fwnode, con_id, index, flags, label);
if (IS_ERR(desc)) {
devres_free(dr);
return desc;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 158e327a1285..11a6f4777436 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4317,6 +4317,54 @@ static int platform_gpio_count(struct device *dev, const 
char *con_id)
return count;
 }
 
+/**
+ * fwnode_gpiod_get_index - obtain a GPIO from firmware node
+ * @fwnode:handle of the firmware node
+ * @con_id:function within the GPIO consumer
+ * @index: index of the GPIO to obtain for the consumer
+ * @flags: GPIO initialization flags
+ * @label: label to attach to the requested GPIO
+ *
+ * This function can be used for drivers that get their configuration
+ * from opaque firmware.
+ *
+ * The function properly finds the corresponding GPIO using whatever is the
+ * underlying firmware interface and then makes sure that the GPIO
+ * descriptor is requested before it is returned to the caller.
+ *
+ * Returns:
+ * On successful request the GPIO pin is configured in accordance with
+ * provided @flags.
+ *
+ * In case of error an ERR_PTR() is returned.
+ */
+struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode,
+const char *con_id, int index,
+enum gpiod_flags flags,
+const char *label)
+{
+   struct gpio_desc *desc;
+   char prop_name[32]; /* 32 is max size of property name */
+   unsigned int i;
+
+   for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) {
+   if (con_id)
+   snprintf(prop_name, sizeof(prop_name), "%s-%s",
+   con_id, gpio_suffixes[i]);
+   else
+   snprintf(prop_name, sizeof(prop_name), "%s",
+   gpio_suffixes[i]);
+
+   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
+ label);
+   if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
+   break;
+   }
+
+   return desc;
+}
+EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index);
+
 /**
  * gpiod_count - return the number of GPIOs associated with a device / function
  * or -ENOENT if no GPIO has been assigned to the requested 
function
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index dc0ddcd30515..5215fdba6b9a 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -176,6 +176,10 @@ struct gpio_desc *fwnode_get_named_gpiod(struct 
fwnode_handle *fwnode,
 const char *propname, int index,
 enum gpiod_flags dflags,
 const char 

[PATCH v2 0/2] Add support for software nodes to gpiolib

2019-09-12 Thread Dmitry Torokhov
This is a part of the larger series previously posted at

https://lore.kernel.org/linux-gpio/20190911075215.78047-1-dmitry.torok...@gmail.com

that was rebased on top of linux-gpio devel branch.

Changes in v2:
- switched export to be EXPORT_SYMBOL_GPL to match the new export
  markings for the rest of GPIO devres functions
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

Dmitry Torokhov (2):
  gpiolib: introduce devm_fwnode_gpiod_get_index()
  gpiolib: introduce fwnode_gpiod_get_index()

 drivers/gpio/gpiolib-devres.c | 33 ++---
 drivers/gpio/gpiolib.c| 48 +++
 include/linux/gpio/consumer.h | 54 ---
 3 files changed, 101 insertions(+), 34 deletions(-)

-- 
2.23.0.237.gc6a4ce50a0-goog
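The property-name iteration that both patches centre on can be modelled in plain C; gpio_suffixes below mirrors the table of the same name in gpiolib.

```c
#include <stdio.h>
#include <string.h>

/* For a given connection id, try each conventional GPIO property
 * suffix in turn ("gpios" first, then "gpio"), as the lookup loop in
 * fwnode_gpiod_get_index() does. */
static const char *const gpio_suffixes[] = { "gpios", "gpio" };

static void gpio_prop_name(char *buf, size_t len,
			   const char *con_id, unsigned int i)
{
	if (con_id)
		snprintf(buf, len, "%s-%s", con_id, gpio_suffixes[i]);
	else
		snprintf(buf, len, "%s", gpio_suffixes[i]);
}
```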



[PATCH v2 1/2] gpiolib: introduce devm_fwnode_gpiod_get_index()

2019-09-12 Thread Dmitry Torokhov
devm_fwnode_get_index_gpiod_from_child() is too long a name; besides, the
fwnode in question does not have to be a child of the device node. Let's
rename it to devm_fwnode_gpiod_get_index() and keep the old name for
compatibility for now.

Also let's add a devm_fwnode_gpiod_get() wrapper, as the majority of
callers need a single GPIO.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Dmitry Torokhov 

---

Changes in v2:
- switched export to be EXPORT_SYMBOL_GPL to match the new export
  markings for the rest of GPIO devres functions
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

 drivers/gpio/gpiolib-devres.c | 19 
 include/linux/gpio/consumer.h | 41 ++-
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 98e3c20d9730..9a0475c87f95 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -185,12 +185,11 @@ struct gpio_desc *devm_gpiod_get_from_of_node(struct 
device *dev,
 EXPORT_SYMBOL_GPL(devm_gpiod_get_from_of_node);
 
 /**
- * devm_fwnode_get_index_gpiod_from_child - get a GPIO descriptor from a
- * device's child node
+ * devm_fwnode_gpiod_get_index - get a GPIO descriptor from a given node
  * @dev:   GPIO consumer
+ * @fwnode:firmware node containing GPIO reference
  * @con_id:function within the GPIO consumer
  * @index: index of the GPIO to obtain in the consumer
- * @child: firmware node (child of @dev)
  * @flags: GPIO initialization flags
  * @label: label to attach to the requested GPIO
  *
@@ -200,11 +199,11 @@ EXPORT_SYMBOL_GPL(devm_gpiod_get_from_of_node);
  * On successful request the GPIO pin is configured in accordance with
  * provided @flags.
  */
-struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
-   const char *con_id, int index,
-   struct fwnode_handle *child,
-   enum gpiod_flags flags,
-   const char *label)
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+ struct fwnode_handle *fwnode,
+ const char *con_id, int index,
+ enum gpiod_flags flags,
+ const char *label)
 {
char prop_name[32]; /* 32 is max size of property name */
struct gpio_desc **dr;
@@ -224,7 +223,7 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
snprintf(prop_name, sizeof(prop_name), "%s",
gpio_suffixes[i]);
 
-   desc = fwnode_get_named_gpiod(child, prop_name, index, flags,
+   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
  label);
if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
break;
@@ -239,7 +238,7 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
 
return desc;
 }
-EXPORT_SYMBOL_GPL(devm_fwnode_get_index_gpiod_from_child);
+EXPORT_SYMBOL_GPL(devm_fwnode_gpiod_get_index);
 
 /**
  * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional()
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index b70af921c614..dc0ddcd30515 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -176,11 +176,11 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 const char *propname, int index,
 enum gpiod_flags dflags,
 const char *label);
-struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
-   const char *con_id, int index,
-   struct fwnode_handle *child,
-   enum gpiod_flags flags,
-   const char *label);
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+ struct fwnode_handle *child,
+ const char *con_id, int index,
+ enum gpiod_flags flags,
+ const char *label);
 
 #else /* CONFIG_GPIOLIB */
 
@@ -531,6 +531,29 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
return ERR_PTR(-ENOSYS);
 }
 
+static inline
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+   

[PATCH 03/11] KVM: x86/mmu: Use fast invalidate mechanism to zap MMIO sptes

2019-09-12 Thread Sean Christopherson
Use the fast invalidate mechanism to zap MMIO sptes on an MMIO generation
wrap.  The fast invalidate flow was reintroduced to fix a livelock bug
in kvm_mmu_zap_all() that can occur if kvm_mmu_zap_all() is invoked when
the guest has live vCPUs.  I.e. using kvm_mmu_zap_all() to handle the
MMIO generation wrap is theoretically susceptible to the livelock bug.

This effectively reverts commit 4771450c345dc ("Revert "KVM: MMU: drop
kvm_mmu_zap_mmio_sptes""), i.e. restores the behavior of commit
a8eca9dcc656a ("KVM: MMU: drop kvm_mmu_zap_mmio_sptes").

Note, this actually fixes commit 571c5af06e303 ("KVM: x86/mmu:
Voluntarily reschedule as needed when zapping MMIO sptes"), but there
is no need to incrementally revert back to using fast invalidate, e.g.
doing so doesn't provide any bisection or stability benefits.

Fixes: 571c5af06e303 ("KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes")
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c  | 17 +++--
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc279b513446..ef378abac00f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,7 +320,6 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
-   bool mmio_cached;
 
/*
 * The following two entries are used to key the shadow page in the
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 373e6f052f9f..8d3fbc48d1be 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
 
-   page_header(__pa(sptep))->mmio_cached = true;
-
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
 }
@@ -5947,7 +5945,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
-static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
+void kvm_mmu_zap_all(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -5956,14 +5954,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-   if (mmio_only && !sp->mmio_cached)
-   continue;
if (sp->role.invalid && sp->root_count)
continue;
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
-			WARN_ON_ONCE(mmio_only);
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
 			goto restart;
-		}
 		if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
@@ -5972,11 +5966,6 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
 	spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-   return __kvm_mmu_zap_all(kvm, false);
-}
-
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -5998,7 +5987,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 */
if (unlikely(gen == 0)) {
		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
-   __kvm_mmu_zap_all(kvm, true);
+   kvm_mmu_zap_all_fast(kvm);
}
 }
 
-- 
2.22.0



[PATCH 00/11] KVM: x86/mmu: Restore fast invalidate/zap flow

2019-09-12 Thread Sean Christopherson
Restore the fast invalidate flow for zapping shadow pages and use it
whenever vCPUs can be active in the VM.  This fixes (in theory, not yet
confirmed) a regression reported by James Harvey where KVM can livelock
in kvm_mmu_zap_all() when it's invoked in response to a memslot update.

The fast invalidate flow was removed as it was deemed to be unnecessary
after its primary user, memslot flushing, was reworked to zap only the
memslot in question instead of all shadow pages.  Unfortunately, zapping
only the memslot being (re)moved during a memslot update introduced a
regression for VMs with assigned devices.  Because we could not discern
why zapping only the relevant memslot broke device assignment, or if the
regression extended beyond device assignment, we reverted to zapping all
shadow pages when a memslot is (re)moved.

The revert to "zap all" failed to account for subsequent changes that
have been made to kvm_mmu_zap_all() between then and now.  Specifically,
kvm_mmu_zap_all() now conditionally reschedules and drops mmu_lock
if a reschedule is needed or if the lock is contended.  Dropping the lock
allows other vCPUs to add shadow pages, and, with enough vCPUs, can cause
kvm_mmu_zap_all() to get stuck in an infinite loop as it can never zap all
pages before observing lock contention or the need to reschedule.

The reasoning behind having kvm_mmu_zap_all() conditionally reschedule was
that it would only be used when the VM is inaccessible, e.g. when its
mm_struct is dying or when the VM itself is being destroyed.  In that case,
playing nice with the rest of the kernel instead of hogging cycles to free
unused shadow pages made sense.

Since it's unlikely we'll root cause the device assignment regression any
time soon, and since simply removing the conditional rescheduling isn't
guaranteed to return us to a known good state, restore the fast invalidate
flow for zapping on memslot updates, including MMIO generation wraparound.
Opportunistically tack on a bug fix and a couple of enhancements.

Alex and James, it probably goes without saying... please test, especially
patch 01/11 as a standalone patch as that'll likely need to be applied to
stable branches, assuming it works.  Thanks!

Sean Christopherson (11):
  KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
  KVM: x86/mmu: Treat invalid shadow pages as obsolete
  KVM: x86/mmu: Use fast invalidate mechanism to zap MMIO sptes
  KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow
page related tracepoints""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for
kvm_mmu_invalidate_all_pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap
all pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete
page first""
  KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"
  KVM: x86/mmu: Explicitly track only a single invalid mmu generation
  KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero

 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/mmu.c  | 154 
 arch/x86/kvm/mmutrace.h |  42 +++--
 arch/x86/kvm/x86.c  |   1 +
 4 files changed, 173 insertions(+), 28 deletions(-)

-- 
2.22.0



[PATCH 02/11] KVM: x86/mmu: Treat invalid shadow pages as obsolete

2019-09-12 Thread Sean Christopherson
Treat invalid shadow pages as obsolete to fix a bug where an obsolete
and invalid page with a non-zero root count could become non-obsolete
due to mmu_valid_gen wrapping.  The bug is largely theoretical with the
current code base, as an unsigned long will effectively never wrap on
64-bit KVM, and userspace would have to deliberately stall a vCPU in
order to keep an obsolete invalid page on the active list while
simultaneously modifying memslots billions of times to trigger a wrap.

The obvious alternative is to use a 64-bit value for mmu_valid_gen,
but it's actually desirable to go in the opposite direction, i.e. using
a smaller 8-bit value to reduce KVM's memory footprint by 8 bytes per
shadow page, and relying on proper treatment of invalid pages instead of
preventing the generation from wrapping.

Note, "Fixes" points at a commit that was at one point reverted, but has
since been restored.

Fixes: 5304b8d37c2a5 ("KVM: MMU: fast invalidate all pages")
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5ac5e3f50f92..373e6f052f9f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2252,7 +2252,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp,   \
  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-   if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {\
+   if (is_obsolete_sp((_kvm), (_sp))) {\
} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
@@ -2311,7 +2311,8 @@ static void mmu_audit_disable(void) { }
 
 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+   return sp->role.invalid ||
+  unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
 }
 
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-- 
2.22.0



[PATCH 04/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints""

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
tracing of the generation number in shadow page tracepoints.

This reverts commit b59c4830ca185ba0e9f9e046fb1cd10a4a92627a.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmutrace.h | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index d8001b4bca05..e9832b5ec53c 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,16 +8,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-   __field(__u64, gfn) \
-   __field(__u32, role) \
-   __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(unsigned long, mmu_valid_gen)   \
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp) \
-   __entry->gfn = sp->gfn;  \
-   __entry->role = sp->role.word;   \
-   __entry->root_count = sp->root_count;\
+#define KVM_MMU_PAGE_ASSIGN(sp)\
+   __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+   __entry->gfn = sp->gfn; \
+   __entry->role = sp->role.word;  \
+   __entry->root_count = sp->root_count;   \
__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({   \
@@ -29,8 +31,9 @@
\
role.word = __entry->role;  \
\
-   trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s"\
+   trace_seq_printf(p, "sp gen %lx gfn %llx l%u %u-byte q%u%s %s%s"\
 " %snxe %sad root %u %s%c",\
+__entry->mmu_valid_gen,\
 __entry->gfn, role.level,  \
 role.gpte_is_8_bytes ? 8 : 4,  \
 role.quadrant, \
-- 
2.22.0



[PATCH 11/11] KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero

2019-09-12 Thread Sean Christopherson
Do not skip invalid shadow pages when zapping obsolete pages if the
pages' root_count has reached zero, in which case the page can be
immediately zapped and freed.

Update the comment accordingly.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a7b14750cde9..5e41b1f77a6d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5692,11 +5692,12 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
break;
 
/*
-* Since we are reversely walking the list and the invalid
-* list will be moved to the head, skip the invalid page
-* can help us to avoid the infinity list walking.
+* Skip invalid pages with a non-zero root count, zapping pages
+* with a non-zero root count will never succeed, i.e. the page
+* will get thrown back on active_mmu_pages and we'll get stuck
+* in an infinite loop.
 */
-   if (sp->role.invalid)
+   if (sp->role.invalid && sp->root_count)
continue;
 
/*
-- 
2.22.0



[PATCH 06/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrasing the original changelog:

  Zap at least 10 shadow pages before releasing mmu_lock to reduce the
  overhead associated with re-acquiring the lock.

  Note: "10" is an arbitrary number, speculated to be high enough so
  that a vCPU isn't stuck zapping obsolete pages for an extended period,
  but small enough so that other vCPUs aren't starved waiting for
  mmu_lock.

This reverts commit 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 35 +--
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bf20afc3e73..827414b12dbd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5670,12 +5670,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return alloc_mmu_pages(vcpu);
 }
 
-
+#define BATCH_ZAP_PAGES			10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
-   int ign;
+   int nr_zapped, batch = 0;
 
 restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5688,28 +5688,6 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
break;
 
/*
-		 * Do not repeatedly zap a root page to avoid unnecessary
-		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-		 * progress:
-		 *    vcpu 0                        vcpu 1
-		 *                         call vcpu_enter_guest():
-		 *                            1): handle KVM_REQ_MMU_RELOAD
-		 *                                and require mmu-lock to
-		 *                                load mmu
-		 * repeat:
-		 *    1): zap root page and
-		 *        send KVM_REQ_MMU_RELOAD
-		 *
-		 *                            2): if (cond_resched_lock(mmu-lock))
-		 *
-		 *                            2): hold mmu-lock and load mmu
-		 *
-		 *                            3): see KVM_REQ_MMU_RELOAD bit
-		 *                                on vcpu->requests is set
-		 *                                then return 1 to call
-		 *                                vcpu_enter_guest() again.
-		 *            goto repeat;
-		 *
 * Since we are reversely walking the list and the invalid
 * list will be moved to the head, skip the invalid page
 * can help us to avoid the infinity list walking.
@@ -5717,14 +5695,19 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
if (sp->role.invalid)
continue;
 
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+		if (batch >= BATCH_ZAP_PAGES &&
+		    (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+			batch = 0;
 			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 			cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+					       &nr_zapped)) {
+			batch += nr_zapped;
 			goto restart;
+		}
}
 
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-- 
2.22.0



[PATCH 08/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first""

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrasing the original changelog:

  Introduce a per-VM list to track obsolete shadow pages, i.e. pages
  which have been deleted from the mmu cache but haven't yet been freed.
  When page reclaiming is needed, zap/free the deleted pages first.

This reverts commit 52d5dedc79bdcbac2976159a172069618cf31be5.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c  | 22 +-
 arch/x86/kvm/x86.c  |  1 +
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ef378abac00f..6e4fa75351fd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -862,6 +862,7 @@ struct kvm_arch {
 * Hash table of struct kvm_mmu_page.
 */
struct list_head active_mmu_pages;
+   struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8c0648bbc7c1..84d916674529 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5674,7 +5674,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
-   LIST_HEAD(invalid_list);
int nr_zapped, batch = 0;
 
 restart:
@@ -5707,8 +5706,8 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
goto restart;
}
 
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
-					       &nr_zapped)) {
+		if (__kvm_mmu_prepare_zap_page(kvm, sp,
+				&kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
batch += nr_zapped;
goto restart;
}
@@ -5719,7 +5718,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 * KVM is not in the middle of a lockless shadow page table walk, which
 * may reference the pages.
 */
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
 /*
@@ -5751,6 +5750,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
@@ -6021,16 +6025,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 * want to shrink a VM that only started to populate its MMU
 * anyway.
 */
-   if (!kvm->arch.n_used_mmu_pages)
+   if (!kvm->arch.n_used_mmu_pages &&
+   !kvm_has_zapped_obsolete_pages(kvm))
continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
+   if (kvm_has_zapped_obsolete_pages(kvm)) {
+   kvm_mmu_commit_zap_page(kvm,
+					&kvm->arch.zapped_obsolete_pages);
+   goto unlock;
+   }
+
 		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..3d092b0f6bcb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9306,6 +9306,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
-- 
2.22.0



[PATCH 05/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages""

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
the tracepoint associated with said mechanism.

Note, the name of the tracepoint deviates from the original tracepoint
so as to match KVM's current nomenclature.

This reverts commit 42560fb1f3c6c7f730897b7fa7a478bc37e0be50.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c  |  1 +
 arch/x86/kvm/mmutrace.h | 21 +
 2 files changed, 22 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8d3fbc48d1be..0bf20afc3e73 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5742,6 +5742,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
 	spin_lock(&kvm->mmu_lock);
+   trace_kvm_mmu_zap_all_fast(kvm);
kvm->arch.mmu_valid_gen++;
 
kvm_zap_obsolete_pages(kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index e9832b5ec53c..1a063ba76281 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -282,6 +282,27 @@ TRACE_EVENT(
)
 );
 
+TRACE_EVENT(
+   kvm_mmu_zap_all_fast,
+   TP_PROTO(struct kvm *kvm),
+   TP_ARGS(kvm),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, mmu_valid_gen)
+   __field(unsigned int, mmu_used_pages)
+   ),
+
+   TP_fast_assign(
+   __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+   __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+   ),
+
+   TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+   )
+);
+
+
 TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
-- 
2.22.0



[PATCH 01/11] KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot

2019-09-12 Thread Sean Christopherson
Reintroduce the fast invalidate mechanism and use it when zapping shadow
pages in response to a memslot being deleted/moved.  Using the fast
mechanism fixes a livelock reported by James Harvey that was introduced
by commit d012a06ab1d23 ("Revert "KVM: x86/mmu: Zap only the relevant
pages when removing a memslot"").

The livelock occurs because kvm_mmu_zap_all() as it exists today will
voluntarily reschedule and drop KVM's mmu_lock, which allows other vCPUs
to add shadow pages.  With enough vCPUs, kvm_mmu_zap_all() can get stuck
in an infinite loop as it can never zap all pages before observing lock
contention or the need to reschedule.

The equivalent of kvm_mmu_zap_all() that was in use at the time of
the reverted commit (4e103134b8623, "KVM: x86/mmu: Zap only the relevant
pages when removing a memslot") employed a fast invalidate mechanism and
was not susceptible to the above livelock.  Restore the fast invalidate
code and use it when flushing a memslot.

Reverting the revert (commit d012a06ab1d23) is not a viable option as
the revert is needed to fix a regression that occurs when the guest has
one or more assigned devices.

Alternatively, the livelock could be eliminated by removing the
conditional reschedule from kvm_mmu_zap_all().  However, although
removing the reschedule would be a smaller code change, it's less safe
in the sense that the resulting kvm_mmu_zap_all() hasn't been used in
the wild for flushing memslots since the fast invalidate mechanism was
introduced by commit 6ca18b6950f8d ("KVM: x86: use the fast way to
invalidate all pages"), back in 2013.

For all intents and purposes, this is a revert of commit ea145aacf4ae8
("Revert "KVM: MMU: fast invalidate all pages"") and a partial revert of
commit 7390de1e99a70 ("Revert "KVM: x86: use the fast way to invalidate
all pages""), i.e. restores the behavior of commit 5304b8d37c2a5 ("KVM:
MMU: fast invalidate all pages") and commit 6ca18b6950f8d ("KVM: x86:
use the fast way to invalidate all pages") respectively.

Fixes: d012a06ab1d23 ("Revert "KVM: x86/mmu: Zap only the relevant pages when removing a memslot"")
Reported-by: James Harvey 
Cc: Alex Williamson 
Cc: Paolo Bonzini 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/mmu.c  | 101 +++-
 2 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44a5ce57a905..fc279b513446 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,6 +335,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c45ff0cfbd0..5ac5e3f50f92 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2097,6 +2097,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+   /*
+* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+* depends on valid pages being added to the head of the list.  See
+* comments in kvm_zap_obsolete_pages().
+*/
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2246,7 +2252,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp,   \
  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-   if ((_sp)->role.invalid) {\
+   if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {\
} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
@@ -2303,6 +2309,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 struct list_head *invalid_list)
 {
@@ -2527,6 

[PATCH 09/11] KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrasing the original changelog (commit 5ff0568374ed2 was itself a
partial revert):

  Don't force reloading the remote mmu when zapping an obsolete page, as
  a MMU_RELOAD request has already been issued by kvm_mmu_zap_all_fast()
  immediately after incrementing mmu_valid_gen, i.e. after marking pages
  obsolete.

This reverts commit 5ff0568374ed2e585376a3832857ade5daccd381.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 84d916674529..bce19918ca5a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2752,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 
-   if (!sp->role.invalid)
+   /*
+* Obsolete pages cannot be used on any vCPUs, see the comment
+* in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
+* treats invalid shadow pages as being obsolete.
+*/
+   if (!is_obsolete_sp(kvm, sp))
kvm_reload_remote_mmus(kvm);
}
 
-- 
2.22.0



Re: [PATCH] memcg, kmem: do not fail __GFP_NOFAIL charges

2019-09-12 Thread Shakeel Butt
On Wed, Sep 11, 2019 at 8:16 AM Michal Hocko  wrote:
>
> On Wed 11-09-19 07:37:40, Andrew Morton wrote:
> > On Wed, 11 Sep 2019 14:00:02 +0200 Michal Hocko  wrote:
> >
> > > On Mon 09-09-19 13:22:45, Michal Hocko wrote:
> > > > On Fri 06-09-19 11:24:55, Shakeel Butt wrote:
> > > [...]
> > > > > I wonder what has changed since
> > > > > .
> > > >
> > > > I have completely forgot about that one. It seems that we have just
> > > > repeated the same discussion again. This time we have a poor user who
> > > > actually enabled the kmem limit.
> > > >
> > > > I guess there was no real objection to the change back then. The primary
> > > > discussion revolved around the fact that the accounting will stay broken
> > > > even when this particular part was fixed. Considering this leads to easy
> > > > to trigger crash (with the limit enabled) then I guess we should just
> > > > make it less broken and backport to stable trees and have a serious
> > > > discussion about discontinuing of the limit. Start by simply failing to
> > > > set any limit in the current upstream kernels.
> > >
> > > Any more concerns/objections to the patch? I can add a reference to your
> > > earlier post Shakeel if you want or to credit you the way you prefer.
> > >
> > > Also are there any objections to start deprecating process of kmem
> > > limit? I would see it in two stages
> > > - 1st warn in the kernel log
> > > pr_warn("kmem.limit_in_bytes is deprecated and will be removed.
> > > "Please report your usecase to linux...@kvack.org if you "
> > > "depend on this functionality."
> >
> > pr_warn_once() :)
> >
> > > - 2nd fail any write to kmem.limit_in_bytes
> > > - 3rd remove the control file completely
> >
> > Sounds good to me.
>
> Here we go
>
> From 512822e551fe2960040c23b12c7b27a5fdab9013 Mon Sep 17 00:00:00 2001
> From: Michal Hocko 
> Date: Wed, 11 Sep 2019 17:02:33 +0200
> Subject: [PATCH] memcg, kmem: deprecate kmem.limit_in_bytes
>
> Cgroup v1 memcg controller has exposed a dedicated kmem limit to users
> which turned out to be really a bad idea because there are paths which
> cannot shrink the kernel memory usage enough to get below the limit
> (e.g. because the accounted memory is not reclaimable). There are cases
> when the failure is not even allowed (e.g. __GFP_NOFAIL). This means
> that the kmem limit is in excess of the hard limit without any way to
> shrink and thus completely useless. OOM killer cannot be invoked to
> handle the situation because that would lead to a premature oom killing.
>
> As a result many places might see ENOMEM returning from kmalloc and
> result in unexpected errors. E.g. a global OOM killer when there is a
> lot of free memory because ENOMEM is translated into VM_FAULT_OOM in #PF
> path and therefore pagefault_out_of_memory would result in OOM killer.
>
> Please note that the kernel memory is still accounted to the overall
> limit along with the user memory so removing the kmem specific limit
> should still allow to contain kernel memory consumption. Unlike the kmem
> one, though, it invokes memory reclaim and targeted memcg oom killing if
> necessary.
>
> Start the deprecation process by crying to the kernel log. Let's see
> whether there are relevant usecases and simply return EINVAL in the
> second stage if nobody complains in a few releases.
>
> Signed-off-by: Michal Hocko 

Reviewed-by: Shakeel Butt 

> ---
>  Documentation/admin-guide/cgroup-v1/memory.rst | 3 +++
>  mm/memcontrol.c| 3 +++
>  2 files changed, 6 insertions(+)
>
> diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst 
> b/Documentation/admin-guide/cgroup-v1/memory.rst
> index 41bdc038dad9..e53fc2f31549 100644
> --- a/Documentation/admin-guide/cgroup-v1/memory.rst
> +++ b/Documentation/admin-guide/cgroup-v1/memory.rst
> @@ -87,6 +87,9 @@ Brief summary of control files.
>  node
>
>   memory.kmem.limit_in_bytes  set/show hard limit for kernel memory
> + This knob is deprecated and shouldn't be
> + used. It is planned to be removed in
> + the foreseeable future.
>   memory.kmem.usage_in_bytes  show current kernel memory allocation
>   memory.kmem.failcnt show the number of kernel memory usage
>  hits limits
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e18108b2b786..113969bc57e8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3518,6 +3518,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file 
> *of,
> ret = mem_cgroup_resize_max(memcg, nr_pages, true);
> break;
> case _KMEM:
> +   pr_warn_once("kmem.limit_in_bytes is deprecated and 
> will be removed. "
> + 

[PATCH 07/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages""

2019-09-12 Thread Sean Christopherson
Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrasing the original changelog:

  Reload the mmu on all vCPUs after updating the generation number so
  that obsolete pages are not used by any vCPUs.  This allows collapsing
  all TLB flushes during obsolete page zapping into a single flush, as
  there is no need to flush when dropping mmu_lock (to reschedule).

  Note: a remote TLB flush is still needed before freeing the pages as
  other vCPUs may be doing a lockless shadow page walk.

Opportunistically improve the comments restored by the revert (the
code itself is a true revert).

This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 827414b12dbd..8c0648bbc7c1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5695,11 +5695,15 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
if (sp->role.invalid)
continue;
 
+   /*
+* No need to flush the TLB since we're only zapping shadow
+* pages with an obsolete generation number and all vCPUS have
+* loaded a new root, i.e. the shadow pages being zapped cannot
+* be in active use by the guest.
+*/
if (batch >= BATCH_ZAP_PAGES &&
-   (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+   cond_resched_lock(&kvm->mmu_lock)) {
batch = 0;
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
@@ -5710,6 +5714,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
}
}
 
+   /*
+* Trigger a remote TLB flush before freeing the page tables to ensure
+* KVM is not in the middle of a lockless shadow page table walk, which
+* may reference the pages.
+*/
kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
@@ -5728,6 +5737,16 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
trace_kvm_mmu_zap_all_fast(kvm);
kvm->arch.mmu_valid_gen++;
 
+   /*
+* Notify all vcpus to reload its shadow page table and flush TLB.
+* Then all vcpus will switch to new shadow page table with the new
+* mmu_valid_gen.
+*
+* Note: we need to do this under the protection of mmu_lock,
+* otherwise, vcpu would purge shadow page but miss tlb flush.
+*/
+   kvm_reload_remote_mmus(kvm);
+
kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
 }
-- 
2.22.0



[PATCH 10/11] KVM: x86/mmu: Explicitly track only a single invalid mmu generation

2019-09-12 Thread Sean Christopherson
Toggle mmu_valid_gen between '0' and '1' instead of blindly incrementing
the generation.  Because slots_lock is held for the entire duration of
zapping obsolete pages, it's impossible for there to be multiple invalid
generations associated with shadow pages at any given time.

Toggling between the two generations (valid vs. invalid) allows changing
mmu_valid_gen from an unsigned long to a u8, which reduces the size of
struct kvm_mmu_page from 160 to 152 bytes on 64-bit KVM, i.e. reduces
KVM's memory footprint by 8 bytes per shadow page.

Set sp->mmu_valid_gen before it is added to active_mmu_pages.
Functionally this has no effect as kvm_mmu_alloc_page() has a single
caller that sets sp->mmu_valid_gen soon thereafter, but visually it is
jarring to see a shadow page being added to the list without its
mmu_valid_gen first being set.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/kvm/mmu.c  | 14 --
 arch/x86/kvm/mmutrace.h | 16 
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e4fa75351fd..8912b04d4ae1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+   u8 mmu_valid_gen;
 
/*
 * The following two entries are used to key the shadow page in the
@@ -334,7 +335,6 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -856,7 +856,7 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
-   unsigned long mmu_valid_gen;
+   u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bce19918ca5a..a7b14750cde9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2101,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu, int direct
 * depends on valid pages being added to the head of the list.  See
 * comments in kvm_zap_obsolete_pages().
 */
+   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2537,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
-   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
 
@@ -5737,9 +5737,19 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+   lockdep_assert_held(&kvm->slots_lock);
+
spin_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
-   kvm->arch.mmu_valid_gen++;
+
+   /*
+* Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
+* held for the entire duration of zapping obsolete pages, it's
+* impossible for there to be multiple invalid generations associated
+* with *valid* shadow pages at any given time, i.e. there is exactly
+* one valid generation and (at most) one invalid generation.
+*/
+   kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
/*
 * Notify all vcpus to reload its shadow page table and flush TLB.
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 1a063ba76281..7ca8831c7d1a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,11 +8,11 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS\
-   __field(unsigned long, mmu_valid_gen)   \
-   __field(__u64, gfn) \
-   __field(__u32, role)\
-   __field(__u32, root_count)  \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(__u8, mmu_valid_gen)\
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp)\
@@ -31,7 +31,7 @@
\
role.word = __entry->role;  \

Re: [PATCH V3 2/5] input: keyboard: imx_sc: Add i.MX system controller key support

2019-09-12 Thread Dmitry Torokhov
Hi Anson,

On Tue, Sep 03, 2019 at 05:36:37PM -0400, Anson Huang wrote:
> i.MX8QXP is an ARMv8 SoC which has a Cortex-M4 system controller
> inside; the system controller is in charge of controlling power,
> clock and the SCU key, etc.
> 
> Add i.MX system controller key driver support. The Linux kernel has
> to communicate with the system controller via MU (message unit) IPC
> to get the SCU key's status.
> 
> Signed-off-by: Anson Huang 
> ---
> Changes since V2:
>   - use private platform data instead of global data;
>   - use "key" instead of "pwrkey";
>   - fix some data format.
> ---
>  drivers/input/keyboard/Kconfig  |   7 ++
>  drivers/input/keyboard/Makefile |   1 +
>  drivers/input/keyboard/imx_sc_key.c | 178 
> 
>  3 files changed, 186 insertions(+)
>  create mode 100644 drivers/input/keyboard/imx_sc_key.c
> 
> diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
> index 2e6d288..607acf2 100644
> --- a/drivers/input/keyboard/Kconfig
> +++ b/drivers/input/keyboard/Kconfig
> @@ -469,6 +469,13 @@ config KEYBOARD_IMX
> To compile this driver as a module, choose M here: the
> module will be called imx_keypad.
>  
> +config KEYBOARD_IMX_SC_KEY
> + tristate "IMX SCU Key Driver"
> + depends on IMX_SCU
> + help
> +   This is the system controller key driver for NXP i.MX SoCs with
> +   a system controller inside.
> +
>  config KEYBOARD_NEWTON
>   tristate "Newton keyboard"
>   select SERIO
> diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
> index 9510325..f5b1752 100644
> --- a/drivers/input/keyboard/Makefile
> +++ b/drivers/input/keyboard/Makefile
> @@ -29,6 +29,7 @@ obj-$(CONFIG_KEYBOARD_HIL)  += hil_kbd.o
>  obj-$(CONFIG_KEYBOARD_HIL_OLD)   += hilkbd.o
>  obj-$(CONFIG_KEYBOARD_IPAQ_MICRO)+= ipaq-micro-keys.o
>  obj-$(CONFIG_KEYBOARD_IMX)   += imx_keypad.o
> +obj-$(CONFIG_KEYBOARD_IMX_SC_KEY)+= imx_sc_key.o
>  obj-$(CONFIG_KEYBOARD_HP6XX) += jornada680_kbd.o
>  obj-$(CONFIG_KEYBOARD_HP7XX) += jornada720_kbd.o
>  obj-$(CONFIG_KEYBOARD_LKKBD) += lkkbd.o
> diff --git a/drivers/input/keyboard/imx_sc_key.c 
> b/drivers/input/keyboard/imx_sc_key.c
> new file mode 100644
> index 000..e69479b
> --- /dev/null
> +++ b/drivers/input/keyboard/imx_sc_key.c
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2019 NXP.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define DEBOUNCE_TIME		100
> +#define REPEAT_INTERVAL		60
> +
> +#define SC_IRQ_BUTTON			1
> +#define SC_IRQ_GROUP_WAKE		3
> +#define IMX_SC_MISC_FUNC_GET_BUTTON_STATUS	18
> +
> +struct imx_key_drv_data {
> + int keycode;
> + bool keystate;  /* 1: pressed, 0: release */
> + bool delay_check;
> + struct delayed_work check_work;
> + struct input_dev *input;
> + struct imx_sc_ipc *key_ipc_handle;
> + struct notifier_block key_notifier;
> +};
> +
> +struct imx_sc_msg_key {
> + struct imx_sc_rpc_msg hdr;
> + u8 state;
> +};
> +
> +static int imx_sc_key_notify(struct notifier_block *nb,
> +  unsigned long event, void *group)
> +{
> + struct imx_key_drv_data *priv =
> +  container_of(nb,
> +   struct imx_key_drv_data,
> +   key_notifier);
> +
> + if ((event & SC_IRQ_BUTTON) && (*(u8 *)group == SC_IRQ_GROUP_WAKE)
> + && !priv->delay_check) {
> + priv->delay_check = 1;
> + schedule_delayed_work(&priv->check_work,
> +   msecs_to_jiffies(REPEAT_INTERVAL));
> + }
> +
> + return 0;
> +}
> +
> +static void imx_sc_check_for_events(struct work_struct *work)
> +{
> + struct imx_key_drv_data *priv =
> +  container_of(work,
> +   struct imx_key_drv_data,
> +   check_work.work);
> + struct input_dev *input = priv->input;
> + struct imx_sc_msg_key msg;
> + struct imx_sc_rpc_msg *hdr = &msg;
> + bool state;
> + int ret;
> +
> + hdr->ver = IMX_SC_RPC_VERSION;
> + hdr->svc = IMX_SC_RPC_SVC_MISC;
> + hdr->func = IMX_SC_MISC_FUNC_GET_BUTTON_STATUS;
> + hdr->size = 1;
> +
> + ret = imx_scu_call_rpc(priv->key_ipc_handle, &msg, true);
> + if (ret) {
> + dev_err(&input->dev, "read imx sc key failed, ret %d\n", ret);
> + return;
> + }
> +
> + state = (bool)msg.state;
> +
> + if (!state && !priv->keystate)
> + state = true;
> +
> + if (state ^ priv->keystate) {
> + pm_wakeup_event(input->dev.parent, 0);
> + priv->keystate = state;
> + 

Re: [PATCH] HID: hidraw: replace printk() with corresponding pr_xx() variant

2019-09-12 Thread Dmitry Torokhov
Hi Rishi,

On Thu, Aug 22, 2019 at 10:13:52PM +0530, Rishi Gupta wrote:
> This commit replaces direct invocations of printk with
> their appropriate pr_info/warn() variant.
> 
> Signed-off-by: Rishi Gupta 
> ---
>  drivers/hid/hidraw.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
> index 006bd6f..67b652b 100644
> --- a/drivers/hid/hidraw.c
> +++ b/drivers/hid/hidraw.c
> @@ -197,14 +197,14 @@ static ssize_t hidraw_get_report(struct file *file, 
> char __user *buffer, size_t
>   }
>  
>   if (count > HID_MAX_BUFFER_SIZE) {
> - printk(KERN_WARNING "hidraw: pid %d passed too large report\n",
> + pr_warn("hidraw: pid %d passed too large report\n",
>   task_pid_nr(current));

If you are doing this, you should also look into pr_fmt() so that we do
not need to manually add "hidraw: " prefix to the messages.

>   ret = -EINVAL;
>   goto out;
>   }
>  
>   if (count < 2) {
> - printk(KERN_WARNING "hidraw: pid %d passed too short report\n",
> + pr_warn("hidraw: pid %d passed too short report\n",
>   task_pid_nr(current));
>   ret = -EINVAL;
>   goto out;
> @@ -597,7 +597,7 @@ int __init hidraw_init(void)
>   if (result < 0)
>   goto error_class;
>  
> - printk(KERN_INFO "hidraw: raw HID events driver (C) Jiri Kosina\n");
> + pr_info("hidraw: raw HID events driver (C) Jiri Kosina\n");
>  out:
>   return result;
>  
> -- 
> 2.7.4
> 

Thanks.

-- 
Dmitry


[PATCH] clk: Make clk_bulk_get_all() return a valid "id"

2019-09-12 Thread Bjorn Andersson
The adreno driver expects the "id" field of the returned clk_bulk_data
to be filled in with strings from the clock-names property.

But due to the use of kmalloc_array() in of_clk_bulk_get_all() it
receives a list of bogus pointers instead.

Zero-initialize the "id" field and attempt to populate with strings from
the clock-names property to resolve both these issues.

Fixes: 616e45df7c4a ("clk: add new APIs to operate on all available clocks")
Fixes: 8e3e791d20d2 ("drm/msm: Use generic bulk clock function")
Cc: Dong Aisheng 
Cc: Jordan Crouse 
Signed-off-by: Bjorn Andersson 
---
 drivers/clk/clk-bulk.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 524bf9a53098..e9e16425c739 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -18,10 +18,13 @@ static int __must_check of_clk_bulk_get(struct device_node 
*np, int num_clks,
int ret;
int i;
 
-   for (i = 0; i < num_clks; i++)
+   for (i = 0; i < num_clks; i++) {
+   clks[i].id = NULL;
clks[i].clk = NULL;
+   }
 
for (i = 0; i < num_clks; i++) {
+   of_property_read_string_index(np, "clock-names", i,
+  &clks[i].id);
clks[i].clk = of_clk_get(np, i);
if (IS_ERR(clks[i].clk)) {
ret = PTR_ERR(clks[i].clk);
-- 
2.18.0



Re: [PATCH v3] input: keyboard: snvs_pwrkey: Send key events for i.MX6 S, DL and Q

2019-09-12 Thread Dmitry Torokhov
Hi Robin,

On Wed, Sep 04, 2019 at 06:23:29AM +, Robin van der Gracht wrote:
> The first generation i.MX6 processors do not send an interrupt when the
> power key is pressed. It sends a power down request interrupt if the key is
> released before a hard shutdown (5 second press). This should allow
> software to bring down the SoC safely.
> 
> For this driver to work as a regular power key with the older SoCs, we need
> to send a keypress AND release when we get the power down request irq.
> 
> Signed-off-by: Robin van der Gracht 
> ---
> 
> Changes v2 -> v3:
>  - Drop alt compatible string for identifying first revision snvs hardware,
>read minor revision from register instead.
>  - Drop imx6qdl.dtsi modification and device-tree binding documentation.
>  - Add an additional input_sync() to create 2 seperate input reports for press
>and release.
> 
>  drivers/input/keyboard/Kconfig   |  2 +-
>  drivers/input/keyboard/snvs_pwrkey.c | 28 ++--
>  2 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
> index 7c4f19dab34f..937e58da5ce1 100644
> --- a/drivers/input/keyboard/Kconfig
> +++ b/drivers/input/keyboard/Kconfig
> @@ -436,7 +436,7 @@ config KEYBOARD_SNVS_PWRKEY
>   depends on OF
>   help
> This is the snvs powerkey driver for the Freescale i.MX application
> -   processors that are newer than i.MX6 SX.
> +   processors.
>  
> To compile this driver as a module, choose M here; the
> module will be called snvs_pwrkey.
> diff --git a/drivers/input/keyboard/snvs_pwrkey.c 
> b/drivers/input/keyboard/snvs_pwrkey.c
> index 5342d8d45f81..828580eee0d2 100644
> --- a/drivers/input/keyboard/snvs_pwrkey.c
> +++ b/drivers/input/keyboard/snvs_pwrkey.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  
> +#define SNVS_HPVIDR1_REG 0xF8
>  #define SNVS_LPSR_REG		0x4C	/* LP Status Register */
>  #define SNVS_LPCR_REG		0x38	/* LP Control Register */
>  #define SNVS_HPSR_REG		0x14
> @@ -37,6 +38,7 @@ struct pwrkey_drv_data {
>   int wakeup;
>   struct timer_list check_timer;
>   struct input_dev *input;
> + u8 minor_rev;
>  };
>  
>  static void imx_imx_snvs_check_for_events(struct timer_list *t)
> @@ -45,6 +47,20 @@ static void imx_imx_snvs_check_for_events(struct 
> timer_list *t)
>   struct input_dev *input = pdata->input;
>   u32 state;
>  
> + if (pdata->minor_rev == 0) {
> + /*
> +  * The first generation i.MX6 SoCs only send an interrupt on
> +  * button release. To mimic power-key usage, we'll prepend a
> +  * press event.
> +  */
> + input_report_key(input, pdata->keycode, 1);
> + input_sync(input);
> + input_report_key(input, pdata->keycode, 0);
> + input_sync(input);
> + pm_relax(input->dev.parent);
> + return;
> + }
> +
>   regmap_read(pdata->snvs, SNVS_HPSR_REG, &state);
>   state = state & SNVS_HPSR_BTN ? 1 : 0;
>  
> @@ -67,13 +83,17 @@ static irqreturn_t imx_snvs_pwrkey_interrupt(int irq, 
> void *dev_id)
>  {
>   struct platform_device *pdev = dev_id;
>   struct pwrkey_drv_data *pdata = platform_get_drvdata(pdev);
> + unsigned long expire = jiffies;
>   u32 lp_status;
>  
>   pm_wakeup_event(pdata->input->dev.parent, 0);
>  
>   regmap_read(pdata->snvs, SNVS_LPSR_REG, &lp_status);
> - if (lp_status & SNVS_LPSR_SPO)
> - mod_timer(&pdata->check_timer, jiffies + 
> msecs_to_jiffies(DEBOUNCE_TIME));
> + if (lp_status & SNVS_LPSR_SPO) {
> + if (pdata->minor_rev > 0)
> + expire = jiffies + msecs_to_jiffies(DEBOUNCE_TIME);
> + mod_timer(&pdata->check_timer, expire);

Why do we even need to fire the timer in case of the first generation
hardware? Just send press and release events directly from the ISR.

Thanks.

-- 
Dmitry


[PATCH v5 1/2] drivers: hv: vmbus: Introduce latency testing

2019-09-12 Thread Branden Bonaby
Introduce user-specified latency in the packet reception path by
exposing the test parameters as part of the debugfs channel
attributes. We will control the testing state via these attributes.

Signed-off-by: Branden Bonaby 
---
changes in v5:
 - As per Stephen's suggestion, Moved CONFIG_HYPERV_TESTING
   to lib/Kconfig.debug.

 - Fixed build issue reported by Kbuild, with Michael's
   suggestion to make hv_debugfs part of the hv_vmbus
   module.

 - updated debugfs-hyperv to show kernel version 5.4

changes in v4:
 - Combined v3 patch 2 into this patch, and changed the
   commit description to reflect this.

 - Moved debugfs code from "vmbus_drv.c" that was in
   previous v3 patch 2, into a new file "debugfs.c" in
   drivers/hv.

 - Updated the Makefile to compile "debugfs.c" if
   CONFIG_HYPERV_TESTING is enabled

 - As per Michael's comments, added empty implementations
   of the new functions, so the compiler will not generate
   code when CONFIG_HYPERV_TESTING is not enabled.

 - Added microseconds into description for files in
   Documentation/ABI/testing/debugfs-hyperv.

Changes in v2:
 - Add #ifdef in Kconfig file so test code will not interfere
   with non-test code.
 - Move test code functions for delay to hyperv_vmbus header
   file.
 - Wrap test code under #ifdef statement.
 
Documentation/ABI/testing/debugfs-hyperv |  23 +++
 MAINTAINERS  |   1 +
 drivers/hv/Makefile  |   1 +
 drivers/hv/connection.c  |   1 +
 drivers/hv/hv_debugfs.c  | 185 +++
 drivers/hv/hyperv_vmbus.h|  31 
 drivers/hv/ring_buffer.c |   2 +
 drivers/hv/vmbus_drv.c   |   6 +
 include/linux/hyperv.h   |  19 +++
 lib/Kconfig.debug|   7 +
 10 files changed, 276 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-hyperv
 create mode 100644 drivers/hv/hv_debugfs.c

diff --git a/Documentation/ABI/testing/debugfs-hyperv 
b/Documentation/ABI/testing/debugfs-hyperv
new file mode 100644
index ..4427503ec762
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-hyperv
@@ -0,0 +1,23 @@
+What:   /sys/kernel/debug/hyperv//fuzz_test_state
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing status of a vmbus device, whether it's in an ON
+state or an OFF state
+Users:  Debugging tools
+
+What:   
/sys/kernel/debug/hyperv//delay/fuzz_test_buffer_interrupt_delay
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing buffer interrupt delay value between 0 - 1000
+microseconds (inclusive).
+Users:  Debugging tools
+
+What:   /sys/kernel/debug/hyperv//delay/fuzz_test_message_delay
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing message delay value between 0 - 1000 microseconds
+(inclusive).
+Users:  Debugging tools
diff --git a/MAINTAINERS b/MAINTAINERS
index e7a47b5210fd..00831931eb22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7468,6 +7468,7 @@ F:include/uapi/linux/hyperv.h
 F: include/asm-generic/mshyperv.h
 F: tools/hv/
 F: Documentation/ABI/stable/sysfs-bus-vmbus
+F: Documentation/ABI/testing/debugfs-hyperv
 
 HYPERBUS SUPPORT
 M: Vignesh Raghavendra 
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index a1eec7177c2d..94daf8240c95 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -9,4 +9,5 @@ CFLAGS_hv_balloon.o = -I$(src)
 hv_vmbus-y := vmbus_drv.o \
 hv.o connection.o channel.o \
 channel_mgmt.o ring_buffer.o hv_trace.o
+hv_vmbus-$(CONFIG_HYPERV_TESTING)  += hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 09829e15d4a0..4d4d40832846 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -357,6 +357,7 @@ void vmbus_on_event(unsigned long data)
 
trace_vmbus_on_event(channel);
 
+   hv_debug_delay_test(channel, INTERRUPT_DELAY);
do {
void (*callback_fn)(void *);
 
diff --git a/drivers/hv/hv_debugfs.c b/drivers/hv/hv_debugfs.c
new file mode 100644
index ..933080b51410
--- /dev/null
+++ b/drivers/hv/hv_debugfs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ *   Branden Bonaby 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "hyperv_vmbus.h"
+
+struct dentry *hv_debug_root;
+
+static int hv_debugfs_delay_get(void *data, u64 *val)
+{
+   *val = *(u32 *)data;
+   return 0;
+}
+
+static int hv_debugfs_delay_set(void *data, u64 val)
+{
+   int ret = 0;
+
+   if (val >= 0 && val <= 1000)
+   *(u32 *)data = val;
+   else
+   

[PATCH v5 2/2] tools: hv: add vmbus testing tool

2019-09-12 Thread Branden Bonaby
This is a userspace tool to drive the testing. Currently it supports
introducing user specified delay in the host to guest communication
path on a per-channel basis.

Signed-off-by: Branden Bonaby 
---
Changes in v4:
- Based on Harry's comments, made the tool more
  user friendly and added more error checking.

Changes in v3:
- Align python tool to match Linux coding style.

Changes in v2:
 - Move testing location to new location in debugfs.

 tools/hv/vmbus_testing | 376 +
 1 file changed, 376 insertions(+)
 create mode 100644 tools/hv/vmbus_testing

diff --git a/tools/hv/vmbus_testing b/tools/hv/vmbus_testing
new file mode 100644
index ..e7212903dd1d
--- /dev/null
+++ b/tools/hv/vmbus_testing
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Program to allow users to fuzz test Hyper-V drivers
+# by interfacing with Hyper-V debugfs attributes.
+# Current test methods available:
+#   1. delay testing
+#
+# Current file/directory structure of hyper-V debugfs:
+#   /sys/kernel/debug/hyperv/UUID
+#   /sys/kernel/debug/hyperv/UUID/
+#   /sys/kernel/debug/hyperv/UUID/
+#
+# author: Branden Bonaby 
+
+import os
+import cmd
+import argparse
+import glob
+from argparse import RawDescriptionHelpFormatter
+from argparse import RawTextHelpFormatter
+from enum import Enum
+
+# Do not change unless you change the debugfs attributes
+# in /drivers/hv/debugfs.c. All fuzz testing
+# attributes will start with "fuzz_test".
+
+# debugfs path for hyperv must exist before proceeding
+debugfs_hyperv_path = "/sys/kernel/debug/hyperv"
+if not os.path.isdir(debugfs_hyperv_path):
+print("{} doesn't exist/check permissions".format(debugfs_hyperv_path))
+exit(-1)
+
+class dev_state(Enum):
+off = 0
+on = 1
+
+# File names, that correspond to the files created in
+# /drivers/hv/debugfs.c
+class f_names(Enum):
+state_f = "fuzz_test_state"
+buff_f =  "fuzz_test_buffer_interrupt_delay"
+mess_f =  "fuzz_test_message_delay"
+
+# Both single_actions and all_actions are used
+# for error checking and to allow for some subparser
+# names to be abbreviated. Do not abbreviate the
+# test method names, as it will become less intuitive
+# as to what the user can do. If you do decide to
+# abbreviate the test method name, make sure the main
+# function reflects this change.
+
+all_actions = [
+"disable_all",
+"D",
+"enable_all",
+"view_all",
+"V"
+]
+
+single_actions = [
+"disable_single",
+"d",
+"enable_single",
+"view_single",
+"v"
+]
+
+def main():
+
+file_map = recursive_file_lookup(debugfs_hyperv_path, dict())
+args = parse_args()
+if (not args.action):
+print ("Error, no options selected...exiting")
+exit(-1)
+arg_set = { k for (k,v) in vars(args).items() if v and k != "action" }
+arg_set.add(args.action)
+path = args.path if "path" in arg_set else None
+if (path and path[-1] == "/"):
+path = path[:-1]
+validate_args_path(path, arg_set, file_map)
+if (path and "enable_single" in arg_set):
+state_path = locate_state(path, file_map)
+set_test_state(state_path, dev_state.on.value, args.quiet)
+
+# Use subparsers as the key for different actions
+if ("delay" in arg_set):
+validate_delay_values(args.delay_time)
+if (args.enable_all):
+set_delay_all_devices(file_map, args.delay_time,
+  args.quiet)
+else:
+set_delay_values(path, file_map, args.delay_time,
+ args.quiet)
+elif ("disable_all" in arg_set or "D" in arg_set):
+disable_all_testing(file_map)
+elif ("disable_single" in arg_set or "d" in arg_set):
+disable_testing_single_device(path, file_map)
+elif ("view_all" in arg_set or "V" in arg_set):
+get_all_devices_test_status(file_map)
+elif ("view_single" in arg_set or  "v" in arg_set):
+get_device_test_values(path, file_map)
+
+# Get the state location
+def locate_state(device, file_map):
+return file_map[device][f_names.state_f.value]
+
+# Validate delay values to make sure they are acceptable to
+# enable delays on a device
+def validate_delay_values(delay):
+
+if (delay[0]  == -1 and delay[1] == -1):
+print("\nError, At least 1 value must be greater than 0")
+exit(-1)
+for i in delay:
+if (i < -1 or i == 0 or i > 1000):
+print("\nError, Values must be equal to -1 "
+  "or be > 0 and <= 1000")
+exit(-1)
+
+# Validate argument path

[PATCH v5 0/2] hv: vmbus: add fuzz testing to hv device

2019-09-12 Thread Branden Bonaby
This patchset introduces a testing framework for Hyper-V drivers.
This framework allows us to introduce delays in the packet receive
path on a per-device basis. While the current code only supports
introducing arbitrary delays in the host/guest communication path,
we intend to expand this to support error injection in the future.

changes in v5:
  patch 1:
As per Stephen's suggestion, Moved CONFIG_HYPERV_TESTING
to lib/Kconfig.debug.

Fixed build issue reported by Kbuild, with Michael's
suggestion to make hv_debugfs part of the hv_vmbus
module.

changes in v4:
  patch 1:
Combined previous v3 patches 1 and 2, into a single patch
which is now patch 1. This was done so that calls to
the new debugfs functions are in the same patch as
the definitions for these functions.

Moved debugfs code from "vmbus_drv.c" that was in
previous v3 patch 2, into a new file "debugfs.c" in
drivers/hv.

Updated the Makefile to compile "debugfs.c" if
CONFIG_HYPERV_TESTING is enabled

As per Michael's comments, added empty implementations
of the new functions, so the compiler will not generate
code when CONFIG_HYPERV_TESTING is not enabled.

  patch 2 (was previously v3 patch 3):
Based on Harry's comments, made the tool more
user friendly and added more error checking.

changes in v3:
  patch 2: change call to IS_ERR_OR_NULL, to IS_ERR.

  patch 3: Align python tool to match Linux coding style.

Changes in v2:
  Patch 1: As per Vitaly's suggestion, wrapped the test code under an
   #ifdef and updated the Kconfig file, so that the test code
   will only be used when the config option is set to true.
   (default is false).

   Updated hyperv_vmbus header to contain new #ifdef with new
   new functions for the test code.

  Patch 2: Moved code from under sysfs to debugfs and wrapped it under
   the new ifdef.

   Updated MAINTAINERS file with new debugfs-hyperv file under
   the section for hyperv.

  Patch 3: Updated testing tool with new debugfs location.

Branden Bonaby (2):
  drivers: hv: vmbus: Introduce latency testing
  tools: hv: add vmbus testing tool

 Documentation/ABI/testing/debugfs-hyperv |  23 ++
 MAINTAINERS  |   1 +
 drivers/hv/Makefile  |   1 +
 drivers/hv/connection.c  |   1 +
 drivers/hv/hv_debugfs.c  | 185 +++
 drivers/hv/hyperv_vmbus.h|  31 ++
 drivers/hv/ring_buffer.c |   2 +
 drivers/hv/vmbus_drv.c   |   6 +
 include/linux/hyperv.h   |  19 ++
 lib/Kconfig.debug|   7 +
 tools/hv/vmbus_testing   | 376 +++
 11 files changed, 652 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-hyperv
 create mode 100644 drivers/hv/hv_debugfs.c
 create mode 100644 tools/hv/vmbus_testing

-- 
2.17.1



Re: [Ksummit-discuss] [PATCH v2 3/3] libnvdimm, MAINTAINERS: Maintainer Entry Profile

2019-09-12 Thread Aneesh Kumar K.V

On 9/12/19 12:13 AM, Dan Carpenter wrote:

On Wed, Sep 11, 2019 at 08:48:59AM -0700, Dan Williams wrote:

+Coding Style Addendum
+-
+libnvdimm expects multi-line statements to be double indented. I.e.
+
+if (x...
+&& ...y) {


That looks horrible and it causes a checkpatch warning.  :(  Why not
do it the same way that everyone else does?

if (blah_blah_x && <-- && has to be on the first line for checkpatch
blah_blah_y) { <-- [tab][space][space][space][space]blah

Now all the conditions are aligned visually which makes it readable.
They aren't aligned with the indent block so it's easy to tell the
inside from the if condition.



I came across this while sending patches to the libnvdimm subsystem.
W.r.t. coding style, can we have consistent styles across the kernel?
Otherwise, one would have to change editor settings when working across
different subsystems in the kernel. In this specific case, both
clang-format and the emacs customization tip in the kernel documentation
directory suggest the latter style.


-aneesh




Re: [PATCH v5 2/9] documention: leds: Add multicolor class documentation

2019-09-12 Thread Dan Murphy

Hello Pavel

Thanks for looking at this again

On 9/12/19 3:55 PM, Pavel Machek wrote:

Hi!


+Directory Layout Example
+------------------------
+root:/sys/class/leds/rgb:grouped_leds# ls -lR colors/
+colors/:
+drwxr-xr-x    2 root root     0 Jun 28 20:21 blue
+drwxr-xr-x    2 root root     0 Jun 28 20:21 green
+drwxr-xr-x    2 root root     0 Jun 28 20:21 red
+-rw-------    1 root root  4096 Jun 28 20:21 color_mix
+
+colors/blue:
+-rw-------    1 root root  4096 Jun 28 20:21 intensity
+-r--r--r--    1 root root  4096 Jun 28 20:27 max_intensity
+-r--r--r--    1 root root  4096 Jun 28 20:21 color_id

I don't really like the directories... A bit too much complexity, and
it will have a memory footprint, too.


The directories should be fine to have; I am not seeing the complexity. 
Is memory footprint really an issue? Maybe in the IoT space, but this is 
small, and both IoT and larger systems should be able to handle it.


Having dedicated directories and files clears up issues for user space 
in discovering the parameters for each LED, especially with the color_mix 
file, which I am still not a fan of but conceded and implemented 
anyway.  It also gives user space the flexibility to use the monochrome 
LEDs' specific intensity files.  User space can use either the per-color 
intensity file or the color_mix file; it is a choice for them to make.


This code was modeled off the LP50xx device which has individual LED 
intensity controls as well as a overall brightness control. Since we 
have no feedback from user space folks I feel we have to give some 
options not very many but some.




I'd expect max_intensity to be same for all the leds in
rgb:grouped_leds... Could we simply rely on max_brightness file?


I worked under the assumption that not all grouped LEDs would have the 
same max_intensity.


I don't have specific use cases but wanted this as an option.

Dan


[If not, would one "max_intensity" file in rgb:grouped_leds be
enough?]

Best regards,
Pavel



Re: [PATCH 2/3] powerpc/mm: read TLB Block Invalidate Characteristics

2019-09-12 Thread Aneesh Kumar K.V

On 9/13/19 12:56 AM, Laurent Dufour wrote:

Le 12/09/2019 à 16:44, Aneesh Kumar K.V a écrit :

Laurent Dufour  writes:



+
+    idx = 2;
+    while (idx < len) {
+    unsigned int block_size = local_buffer[idx++];
+    unsigned int npsize;
+
+    if (!block_size)
+    break;
+
+    block_size = 1 << block_size;
+    if (block_size != 8)
+    /* We only support 8 bytes size TLB invalidate buffer */
+    pr_warn("Unsupported H_BLOCK_REMOVE block size : %d\n",
+    block_size);


Should we skip setting block size if we find block_size != 8? Also can
we avoid doing that pr_warn in loop and only warn if we don't find
block_size 8 in the invalidate characteristics array?


My idea here is to fully read and process the data returned by the 
hcall, and to enforce the limitation to 8 bytes when checking before 
calling H_BLOCK_REMOVE.

The warning is there because I want it to be displayed once at boot.




Can we have two block sizes reported for the same base page size/actual 
page size combination? If so, we will overwrite hblk[actual_psize]?





+
+    for (npsize = local_buffer[idx++];  npsize > 0; npsize--)
+    check_lp_set_hblk((unsigned int) local_buffer[idx++],
+  block_size);
+    }
+
+    for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+    for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+    if (mmu_psize_defs[bpsize].hblk[idx])
+    pr_info("H_BLOCK_REMOVE supports base psize:%d 
psize:%d block size:%d",

+    bpsize, idx,
+    mmu_psize_defs[bpsize].hblk[idx]);
+
+    return 0;
+}
+machine_arch_initcall(pseries, read_tlbbi_characteristics);
+
  /*
   * Take a spinlock around flushes to avoid bouncing the hypervisor 
tlbie

   * lock.


-aneesh


[PATCH v2] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

2019-09-12 Thread Sean Christopherson
Use master abort semantics, i.e. reads return all ones and writes are
dropped, to handle unexpected MMIO accesses when reading guest memory
instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
as a guest page fault.

Emulation of certain instructions, notably VMX instructions, involves
reading or writing guest memory without going through the emulator.
These emulation flows are not equipped to handle MMIO accesses as no
sane and properly functioning guest kernel will target MMIO with such
instructions, and so simply inject a page fault in response to
X86EMUL_IO_NEEDED.

While not 100% correct, using master abort semantics is at least
sometimes correct, e.g. non-existent MMIO accesses do actually master
abort, whereas injecting a page fault is always wrong, i.e. the issue
lies in the physical address domain, not in the virtual to physical
translation.

Apply the logic to kvm_write_guest_virt_system() in addition to
replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
the former, i.e. can also leak a host stack address.

Reported-by: Fuqian Huang 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---

v2: Fix the comment for kvm_read_guest_virt_helper().

 arch/x86/kvm/x86.c | 40 +++-
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..3da57f137470 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
   struct x86_exception *exception)
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   int r;
+
+   r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+  exception);
 
/*
-* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
-* is returned, but our callers are not ready for that and they blindly
-* call kvm_inject_page_fault.  Ensure that they at least do not leak
-* uninitialized kernel stack memory into cr2 and error code.
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. reads return all ones.
 */
-   memset(exception, 0, sizeof(*exception));
-   return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+   if (r == X86EMUL_IO_NEEDED) {
+   memset(val, 0xff, bytes);
+   return 0;
+   }
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
@@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt 
*ctxt, gva_t addr, void *v
 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception 
*exception)
 {
+   int r;
+
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
 
-   return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
-  PFERR_WRITE_MASK, exception);
+   r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+   PFERR_WRITE_MASK, exception);
+
+   /*
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
+*/
+   if (r == X86EMUL_IO_NEEDED)
+   return 0;
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-- 
2.22.0



Re: [PATCH] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

2019-09-12 Thread Sean Christopherson
On Thu, Sep 12, 2019 at 04:56:03PM -0700, Sean Christopherson wrote:
> Use master abort semantics, i.e. reads return all ones and writes are
> dropped, to handle unexpected MMIO accesses when reading guest memory
> instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
> as a guest page fault.
> 
> Emulation of certain instructions, notably VMX instructions, involves
> reading or writing guest memory without going through the emulator.
> These emulation flows are not equipped to handle MMIO accesses as no
> sane and properly functioning guest kernel will target MMIO with such
> instructions, and so simply inject a page fault in response to
> X86EMUL_IO_NEEDED.
> 
> While not 100% correct, using master abort semantics is at least
> sometimes correct, e.g. non-existent MMIO accesses do actually master
> abort, whereas injecting a page fault is always wrong, i.e. the issue
> lies in the physical address domain, not in the virtual to physical
> translation.
> 
> Apply the logic to kvm_write_guest_virt_system() in addition to
> replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
> the former, i.e. can also leak a host stack address.
> 
> Reported-by: Fuqian Huang 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/x86.c | 40 +++-
>  1 file changed, 31 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b4cfd786d0b6..d1d7e9fac17a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
>  struct x86_exception *exception)
>  {
>   u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
> + int r;
> +
> + r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
> +exception);
>  
>   /*
> -  * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
> -  * is returned, but our callers are not ready for that and they blindly
> -  * call kvm_inject_page_fault.  Ensure that they at least do not leak
> -  * uninitialized kernel stack memory into cr2 and error code.
> +  * FIXME: this should technically call out to userspace to handle the
> +  * MMIO access, but our callers are not ready for that, so emulate
> +  * master abort behavior instead, i.e. writes are dropped.

Dagnabbit, fixed this to make it 'reads return all ones' and forgot to
commit..  v2 on its way.

>*/
> - memset(exception, 0, sizeof(*exception));
> - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
> -   exception);
> + if (r == X86EMUL_IO_NEEDED) {
> + memset(val, 0xff, bytes);
> + return 0;
> + }
> + if (r == X86EMUL_PROPAGATE_FAULT)
> + return -EFAULT;
> + WARN_ON_ONCE(r);
> + return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>  
> @@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt 
> *ctxt, gva_t addr, void *v
>  int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
>   unsigned int bytes, struct x86_exception 
> *exception)
>  {
> + int r;
> +
>   /* kvm_write_guest_virt_system can pull in tons of pages. */
>   vcpu->arch.l1tf_flush_l1d = true;
>  
> - return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> -PFERR_WRITE_MASK, exception);
> + r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> + PFERR_WRITE_MASK, exception);
> +
> + /*
> +  * FIXME: this should technically call out to userspace to handle the
> +  * MMIO access, but our callers are not ready for that, so emulate
> +  * master abort behavior instead, i.e. writes are dropped.
> +  */
> + if (r == X86EMUL_IO_NEEDED)
> + return 0;
> + if (r == X86EMUL_PROPAGATE_FAULT)
> + return -EFAULT;
> + WARN_ON_ONCE(r);
> + return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
>  
> -- 
> 2.22.0
> 


Re: [PATCH 0/5] hugetlbfs: Disable PMD sharing for large systems

2019-09-12 Thread Dave Chinner
On Wed, Sep 11, 2019 at 04:05:32PM +0100, Waiman Long wrote:
> A customer with large SMP systems (up to 16 sockets) running an application
> that uses a large amount of static hugepages (~500-1500GB) is experiencing
> random multisecond delays. These delays were caused by the long time it
> took to scan the VMA interval tree with mmap_sem held.
> 
> To fix this problem while preserving existing behavior as much as
> possible, we need to allow timeout in down_write() and disable PMD
> sharing when it is taking too long to do so. Since a transaction can
> involve touching multiple huge pages, timing out for each of the huge
> page interactions does not completely solve the problem. So a threshold
> is set to completely disable PMD sharing if too many timeouts happen.
> 
> The first 4 patches of this 5-patch series add a new
> down_write_timedlock() API which accepts a timeout argument and returns
> true if locking is successful or false otherwise. It works more or less
> like down_write_trylock(), but the calling thread may sleep.

Just on general principle, this is a non-starter. If a lock is being
held too long, then whatever the lock is protecting needs fixing.
Adding timeouts to locks and sysctls to tune them is not a viable
solution to address latencies caused by algorithm scalability
issues.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: problem starting /sbin/init (32-bit 5.3-rc8)

2019-09-12 Thread Kees Cook
On Thu, Sep 12, 2019 at 05:16:02PM -0700, Kees Cook wrote:
> On Thu, Sep 12, 2019 at 02:40:19PM -0700, Randy Dunlap wrote:
> > This is 32-bit kernel, just happens to be running on a 64-bit laptop.
> > I added the debug printk in __phys_addr() just before "[cut here]".
> > 
> > CONFIG_HARDENED_USERCOPY=y
> 
> I can reproduce this under CONFIG_DEBUG_VIRTUAL=y, and it goes back
> to at least to v5.2. Booting with "hardened_usercopy=off" or without
> CONFIG_DEBUG_VIRTUAL makes this go away (since __phys_addr() doesn't
> get called):
> 
> __check_object_size+0xff/0x1b0:
> pfn_to_section_nr at include/linux/mmzone.h:1153
> (inlined by) __pfn_to_section at include/linux/mmzone.h:1291
> (inlined by) virt_to_head_page at include/linux/mm.h:729
> (inlined by) check_heap_object at mm/usercopy.c:230
> (inlined by) __check_object_size at mm/usercopy.c:280
> 
> Is virt_to_head_page() illegal to use under some recently new conditions?

This combination appears to be bugged since the original introduction
of hardened usercopy in v4.8. Is this an untested combination until
now? (I don't usually do tests with CONFIG_DEBUG_VIRTUAL, but I guess
I will from now on!)

Note from the future (i.e. the end of this email where I figure it out):
it turns out it's actually these three together:

CONFIG_HIGHMEM=y
CONFIG_DEBUG_VIRTUAL=y
CONFIG_HARDENED_USERCOPY=y

> 
> > The BUG is this line in arch/x86/mm/physaddr.c:
> > VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
> > It's line 83 in my source file only due to adding  and
> > a conditional pr_crit() call.

What exactly is this trying to test?

> > [   19.730409][T1] debug: unmapping init [mem 0xdc7bc000-0xdca30fff]
> > [   19.734289][T1] Write protecting kernel text and read-only data: 
> > 13888k
> > [   19.737675][T1] rodata_test: all tests were successful
> > [   19.740757][T1] Run /sbin/init as init process
> > [   19.792877][T1] __phys_addr: max_low_pfn=0x36ffe, x=0xff001ff1, 
> > phys_addr=0x3f001ff1

It seems like this address is way out of range of the physical memory.
That seems like it's vmalloc or something, but that was actually
explicitly tested for back in the v4.8 version (it became unneeded
later).

> > [   19.796561][T1] [ cut here ]
> > [   19.797501][T1] kernel BUG at ../arch/x86/mm/physaddr.c:83!
> > [   19.802799][T1] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> > [   19.803782][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 
> > #6
> > [   19.803782][T1] Hardware name: Dell Inc. Inspiron 1318   
> > /0C236D, BIOS A04 01/15/2009
> > [   19.803782][T1] EIP: __phys_addr+0xaf/0x100
> > [   19.803782][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 
> > 50 68 90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 
> > 76 11 <0f> 0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
> > [   19.803782][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: 
> > db90a471
> > [   19.803782][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: 
> > f41dde90
> > [   19.803782][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
> > 00010216
> > [   19.803782][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 
> > 000406d0
> > [   19.803782][T1] Call Trace:
> > [   19.803782][T1]  __check_object_size+0xaf/0x3c0
> > [   19.803782][T1]  ? __might_sleep+0x80/0xa0
> > [   19.803782][T1]  copy_strings+0x1c2/0x370

Oh, this is actually copying into a kmap() pointer due to the weird
stuff exec() does:

kaddr = kmap(kmapped_page);
...
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {

> > [   19.803782][T1]  copy_strings_kernel+0x2b/0x40
> > 
> > Full boot log or kernel .config file are available if wanted.

Is kmap somewhere "unexpected" in this case? Ah-ha, yes, it seems it is.
There is even a helper to do the "right" thing as virt_to_page(). This
seems to be used very rarely in the kernel... is there a page type for
kmap pages? This seems like a hack, but it fixes it:


diff --git a/mm/usercopy.c b/mm/usercopy.c
index 98e924864554..5a14b80ad63e 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -11,6 +11,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -227,7 +228,7 @@ static inline void check_heap_object(const void *ptr, 
unsigned long n,
if (!virt_addr_valid(ptr))
return;
 
-   page = virt_to_head_page(ptr);
+   page = compound_head(kmap_to_page((void *)ptr));
 
if (PageSlab(page)) {
/* Check slab allocator for flags and size. */


What's the right way to "ignore" the kmap range? (i.e. it's not Slab, so
ignore it here: I can't find a page type nor a "is this kmap?" helper...)

-- 
Kees Cook


[RFC V1 0/7] Add support for a new IMS interrupt mechanism

2019-09-12 Thread Megha Dey
Currently, MSI (message signaled interrupts) and MSI-X are the de facto
standard device interrupt mechanisms. MSI-X supports up to 2048
interrupts per device while MSI supports 32, which seems more than enough
for current devices. However, the introduction of SIOV (scalable IO
virtualization) shifts the creation of assignable virtual devices from
hardware to a more software-assisted approach. This flexible composition
of directly assignable devices, a.k.a. assignable device interfaces (ADIs),
unchains hardware from the costly PCI standard. Under SIOV, device resources
can now be mapped directly to a guest or other user space drivers for
near-native DMA performance. To complete the functionality of ADIs, a matching
interrupt resource must also be introduced, and it must be scalable.

Interrupt message storage (IMS) is conceived as a scalable, albeit device-
specific, interrupt mechanism to meet such a demand. With IMS, there is
theoretically no upper bound on the number of interrupts which a device
can support. The size and location of IMS is device-specific; some devices
may implement IMS as on-device storage which are memory-mapped, others may
opt to implement IMS in system memory. IMS stores each interrupt message as
a DWORD size data payload and a 64-bit address (same as MSI-X). Access to
the IMS is through the host driver due to the non-architectural nature of
device IMS unlike the architectural MSI-X table which are accessed through
PCI drivers.

In this patchset, we introduce generic IMS APIs that fit the Linux IRQ
subsystem, and support an IMS IRQ chip and domains that can be used by
drivers capable of generating IMS interrupts.

The IMS has been introduced as part of Intel's Scalable I/O virtualization
specification:
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification

This patchset is based on Linux 5.3-rc8.

Currently there is no device on the market which supports SIOV (hence no
device supports IMS).

This series is a basic patchset to get the ball rolling and receive some
initial comments. As per my discussion with Marc Zyngier and Thomas Gleixner
at Linux Plumbers, I need to do the following:
1. Since a device can support MSI-X and IMS simultaneously, ensure proper
   locking mechanism for the 'msi_list' in the device structure.
2. Introduce dynamic allocation of IMS vectors perhaps by using a group ID
3. IMS support of a device needs to be discoverable. A bit in the vendor
   specific capability in the PCI config is to be added rather than getting
   this information from each device driver.

Jason Gunthorpe of Mellanox Technologies is looking to do something similar
on ARM platforms and was wondering why IMS is x86-specific. Perhaps we can
use this thread to discuss that further.

Megha Dey (7):
  genirq/msi: Differentiate between various MSI based interrupts
  drivers/base: Introduce callbacks for IMS interrupt domain
  x86/ims: Add support for a new IMS irq domain
  irq_remapping: New interfaces to support IMS irqdomain
  x86/ims: Introduce x86_ims_ops
  ims-msi: Add APIs to allocate/free IMS interrupts
  ims: Add the set_desc callback

 arch/mips/pci/msi-xlp.c  |   2 +-
 arch/s390/pci/pci_irq.c  |   2 +-
 arch/x86/include/asm/irq_remapping.h |  13 ++
 arch/x86/include/asm/msi.h   |   4 +
 arch/x86/include/asm/pci.h   |   4 +
 arch/x86/include/asm/x86_init.h  |  10 +
 arch/x86/kernel/apic/Makefile|   1 +
 arch/x86/kernel/apic/ims.c   | 118 
 arch/x86/kernel/apic/msi.c   |   6 +-
 arch/x86/kernel/x86_init.c   |  23 +++
 arch/x86/pci/xen.c   |   2 +-
 drivers/base/Kconfig |   7 +
 drivers/base/Makefile|   1 +
 drivers/base/ims-msi.c   | 353 +++
 drivers/iommu/intel_irq_remapping.c  |  30 +++
 drivers/iommu/irq_remapping.c|   9 +
 drivers/iommu/irq_remapping.h|   3 +
 drivers/pci/msi.c|  19 +-
 drivers/vfio/mdev/mdev_core.c|   6 +
 drivers/vfio/mdev/mdev_private.h |   1 -
 include/linux/intel-iommu.h  |   1 +
 include/linux/mdev.h |   2 +
 include/linux/msi.h  |  55 +-
 kernel/irq/msi.c |   2 +-
 24 files changed, 655 insertions(+), 19 deletions(-)
 create mode 100644 arch/x86/kernel/apic/ims.c
 create mode 100644 drivers/base/ims-msi.c

-- 
2.7.4



[RFC V1 4/7] irq_remapping: New interfaces to support IMS irqdomain

2019-09-12 Thread Megha Dey
Introduce new interfaces for interrupt remapping drivers to support
IMS irqdomains:

irq_remapping_get_ims_irq_domain(): get the IMS irqdomain for an IRQ
allocation. We must build one IMS irqdomain for each interrupt remapping
unit. The driver calls this interface to get the IMS irqdomain associated
with an IR irqdomain which manages the devices.

Architecture specific hooks:
arch_create_ims_irq_domain(): create an IMS irqdomain associated with the
interrupt remapping unit.

We also add following callback into struct irq_remap_ops:
struct irq_domain *(*get_ims_irq_domain)(struct irq_alloc_info *);

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/irq_remapping.h | 13 +
 drivers/iommu/intel_irq_remapping.c  | 30 ++
 drivers/iommu/irq_remapping.c|  9 +
 drivers/iommu/irq_remapping.h|  3 +++
 include/linux/intel-iommu.h  |  1 +
 5 files changed, 56 insertions(+)

diff --git a/arch/x86/include/asm/irq_remapping.h 
b/arch/x86/include/asm/irq_remapping.h
index 4bc985f..a735507 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -48,11 +48,18 @@ extern struct irq_domain *
 irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
 extern struct irq_domain *
 irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_ims_irq_domain(struct irq_alloc_info *info);
 
 /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
 extern struct irq_domain *
 arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int 
id);
 
+/* Create IMS irqdomain, use @parent as the parent irqdomain. */
+#ifdef CONFIG_MSI_IMS
+extern struct irq_domain *arch_create_ims_irq_domain(struct irq_domain 
*parent);
+#endif
+
 /* Get parent irqdomain for interrupt remapping irqdomain */
 static inline struct irq_domain *arch_get_ir_parent_domain(void)
 {
@@ -85,5 +92,11 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info)
return NULL;
 }
 
+static inline struct irq_domain *
+irq_remapping_get_ims_irq_domain(struct irq_alloc_info *info)
+{
+   return NULL;
+}
+
 #endif /* CONFIG_IRQ_REMAP */
 #endif /* __X86_IRQ_REMAPPING_H */
diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c
index 4786ca0..3c0c0cb 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -573,6 +573,10 @@ static int intel_setup_irq_remapping(struct intel_iommu 
*iommu)
 "INTEL-IR-MSI",
 iommu->seq_id);
 
+#ifdef CONFIG_MSI_IMS
+   iommu->ir_ims_domain = arch_create_ims_irq_domain(iommu->ir_domain);
+#endif
+
ir_table->base = page_address(pages);
ir_table->bitmap = bitmap;
iommu->ir_table = ir_table;
@@ -633,6 +637,10 @@ static void intel_teardown_irq_remapping(struct 
intel_iommu *iommu)
irq_domain_remove(iommu->ir_msi_domain);
iommu->ir_msi_domain = NULL;
}
+   if (iommu->ir_ims_domain) {
+   irq_domain_remove(iommu->ir_ims_domain);
+   iommu->ir_ims_domain = NULL;
+   }
if (iommu->ir_domain) {
irq_domain_remove(iommu->ir_domain);
iommu->ir_domain = NULL;
@@ -1139,6 +1147,27 @@ static struct irq_domain *intel_get_irq_domain(struct 
irq_alloc_info *info)
return NULL;
 }
 
+static struct irq_domain *intel_get_ims_irq_domain(struct irq_alloc_info *info)
+{
+   struct intel_iommu *iommu;
+
+   if (!info)
+   return NULL;
+
+   switch (info->type) {
+   case X86_IRQ_ALLOC_TYPE_MSI:
+   case X86_IRQ_ALLOC_TYPE_MSIX:
+   iommu = map_dev_to_ir(info->msi_dev);
+   if (iommu)
+   return iommu->ir_ims_domain;
+   break;
+   default:
+   break;
+   }
+
+   return NULL;
+}
+
 struct irq_remap_ops intel_irq_remap_ops = {
.prepare= intel_prepare_irq_remapping,
.enable = intel_enable_irq_remapping,
@@ -1147,6 +1176,7 @@ struct irq_remap_ops intel_irq_remap_ops = {
.enable_faulting= enable_drhd_fault_handling,
.get_ir_irq_domain  = intel_get_ir_irq_domain,
.get_irq_domain = intel_get_irq_domain,
+   .get_ims_irq_domain = intel_get_ims_irq_domain,
 };
 
 static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 83f36f6..c4352fc 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -193,3 +193,12 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 
return remap_ops->get_irq_domain(info);
 }
+

[RFC V1 3/7] x86/ims: Add support for a new IMS irq domain

2019-09-12 Thread Megha Dey
This patch adds support for the creation of a new IMS irq domain. It
creates a new irq_chip associated with the IMS domain and adds the
necessary domain operations to it.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/msi.h   |  4 ++
 arch/x86/kernel/apic/Makefile|  1 +
 arch/x86/kernel/apic/ims.c   | 93 
 arch/x86/kernel/apic/msi.c   |  4 +-
 drivers/vfio/mdev/mdev_core.c|  6 +++
 drivers/vfio/mdev/mdev_private.h |  1 -
 include/linux/mdev.h |  2 +
 7 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/apic/ims.c

diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index 25ddd09..51f9d25 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -11,4 +11,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device 
*dev, int nvec,
 
 void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
 
+struct msi_domain_info;
+
+irq_hw_number_t msi_get_hwirq(struct msi_domain_info *info,
+   msi_alloc_info_t *arg);
 #endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index a6fcaf16..75a2270 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -12,6 +12,7 @@ obj-y += hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)  += io_apic.o
 obj-$(CONFIG_PCI_MSI)  += msi.o
+obj-$(CONFIG_MSI_IMS)  += ims.o
 obj-$(CONFIG_SMP)  += ipi.o
 
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
new file mode 100644
index 000..d9808a5
--- /dev/null
+++ b/arch/x86/kernel/apic/ims.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2019 Intel Corporation.
+ *
+ * Author: Megha Dey 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Determine if a dev is mdev or not. Return NULL if not mdev device.
+ * Return mdev's parent dev if success.
+ */
+static inline struct device *mdev_to_parent(struct device *dev)
+{
+   struct device *ret = NULL;
+   struct device *(*fn)(struct device *dev);
+   struct bus_type *bus = symbol_get(mdev_bus_type);
+
+   if (bus && dev->bus == bus) {
+   fn = symbol_get(mdev_dev_to_parent_dev);
+   ret = fn(dev);
+   symbol_put(mdev_dev_to_parent_dev);
+   symbol_put(mdev_bus_type);
+   }
+
+   return ret;
+}
+
+static struct pci_dev *ims_get_pci_dev(struct device *dev)
+{
+   struct pci_dev *pdev;
+
+   if (dev_is_mdev(dev)) {
+   struct device *parent = mdev_to_parent(dev);
+
+   pdev = to_pci_dev(parent);
+   } else {
+   pdev = to_pci_dev(dev);
+   }
+
+   return pdev;
+}
+
+int dev_ims_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+   msi_alloc_info_t *arg)
+{
+   struct pci_dev *pdev = ims_get_pci_dev(dev);
+
+   init_irq_alloc_info(arg, NULL);
+   arg->msi_dev = pdev;
+   arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(dev_ims_prepare);
+
+#ifdef CONFIG_IRQ_REMAP
+
+static struct msi_domain_ops dev_ims_domain_ops = {
+   .get_hwirq  = msi_get_hwirq,
+   .msi_prepare= dev_ims_prepare,
+};
+
+static struct irq_chip dev_ims_ir_controller = {
+   .name   = "IR-DEV-IMS",
+   .irq_unmask = dev_ims_unmask_irq,
+   .irq_mask   = dev_ims_mask_irq,
+   .irq_ack= irq_chip_ack_parent,
+   .irq_retrigger  = irq_chip_retrigger_hierarchy,
+   .irq_set_vcpu_affinity  = irq_chip_set_vcpu_affinity_parent,
+   .flags  = IRQCHIP_SKIP_SET_WAKE,
+   .irq_write_msi_msg  = dev_ims_write_msg,
+};
+
+static struct msi_domain_info ims_ir_domain_info = {
+   .flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+   .ops= &dev_ims_domain_ops,
+   .chip   = &dev_ims_ir_controller,
+   .handler= handle_edge_irq,
+   .handler_name   = "edge",
+};
+
+struct irq_domain *arch_create_ims_irq_domain(struct irq_domain *parent)
+{
+   return pci_msi_create_irq_domain(NULL, &ims_ir_domain_info, parent);
+}
+
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 435bcda..65da813 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -84,7 +84,7 @@ void native_teardown_msi_irq(unsigned int irq)
irq_domain_free_irqs(irq, 1);
 }
 
-static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+irq_hw_number_t msi_get_hwirq(struct msi_domain_info *info,
 msi_alloc_info_t *arg)
 {
return arg->msi_hwirq;
@@ -116,7 +116,7 @@ 

[RFC V1 5/7] x86/ims: Introduce x86_ims_ops

2019-09-12 Thread Megha Dey
This patch introduces an x86 specific indirect mechanism to setup the
interrupt message storage. The IMS specific functions (setup, teardown,
restore) become function pointers in an x86_ims_ops struct, that
defaults to their implementations in ims.c and ims-msi.c.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/pci.h  |  4 
 arch/x86/include/asm/x86_init.h | 10 ++
 arch/x86/kernel/apic/ims.c  | 18 ++
 arch/x86/kernel/x86_init.c  | 23 +++
 drivers/base/ims-msi.c  | 34 ++
 include/linux/msi.h |  6 ++
 6 files changed, 95 insertions(+)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index e662f98..2ef513f 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -114,6 +114,10 @@ struct msi_desc;
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
+#ifdef CONFIG_MSI_IMS
+int native_setup_ims_irqs(struct device *dev, int nvec);
+#endif
+
 #else
 #define native_setup_msi_irqs  NULL
 #define native_teardown_msi_irqNULL
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ac09341..9c2cbbb 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -287,6 +287,15 @@ struct x86_msi_ops {
void (*restore_msi_irqs)(struct pci_dev *dev);
 };
 
+struct device;
+
+struct x86_ims_ops {
+   int (*setup_ims_irqs)(struct device *dev, int nvec);
+   void (*teardown_ims_irq)(unsigned int irq);
+   void (*teardown_ims_irqs)(struct device *dev);
+   void (*restore_ims_irqs)(struct device *dev);
+};
+
 struct x86_apic_ops {
unsigned int(*io_apic_read)   (unsigned int apic, unsigned int reg);
void(*restore)(void);
@@ -297,6 +306,7 @@ extern struct x86_cpuinit_ops x86_cpuinit;
 extern struct x86_platform_ops x86_platform;
 extern struct x86_msi_ops x86_msi;
 extern struct x86_apic_ops x86_apic_ops;
+extern struct x86_ims_ops x86_ims;
 
 extern void x86_early_init_platform_quirks(void);
 extern void x86_init_noop(void);
diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
index d9808a5..a539666 100644
--- a/arch/x86/kernel/apic/ims.c
+++ b/arch/x86/kernel/apic/ims.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Determine if a dev is mdev or not. Return NULL if not mdev device.
@@ -45,6 +46,23 @@ static struct pci_dev *ims_get_pci_dev(struct device *dev)
return pdev;
 }
 
+int native_setup_ims_irqs(struct device *dev, int nvec)
+{
+   struct irq_domain *domain;
+   struct irq_alloc_info info;
+   struct pci_dev *pdev = ims_get_pci_dev(dev);
+
+   init_irq_alloc_info(&info, NULL);
+   info.type = X86_IRQ_ALLOC_TYPE_MSIX;
+   info.msi_dev = pdev;
+
+   domain = irq_remapping_get_ims_irq_domain();
+   if (!domain)
+   return -ENOSYS;
+
+   return msi_domain_alloc_irqs(domain, dev, nvec);
+}
+
 int dev_ims_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg)
 {
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 1bef687..3ce42d4 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -153,6 +153,29 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 }
 #endif
 
+#if defined(CONFIG_MSI_IMS)
+struct x86_ims_ops x86_ims __ro_after_init = {
+   .setup_ims_irqs = native_setup_ims_irqs,
+   .teardown_ims_irqs  = dev_ims_teardown_irqs,
+   .restore_ims_irqs   = dev_ims_restore_irqs,
+};
+
+int arch_setup_ims_irqs(struct device *dev, int nvec)
+{
+   return x86_ims.setup_ims_irqs(dev, nvec);
+}
+
+void arch_teardown_ims_irqs(struct device *dev)
+{
+   x86_ims.teardown_ims_irqs(dev);
+}
+
+void arch_restore_ims_irqs(struct device *dev)
+{
+   x86_ims.restore_ims_irqs(dev);
+}
+#endif
+
 struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read   = native_io_apic_read,
.restore= native_restore_boot_irq_mode,
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index 68dc10f..df28ee2 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -92,3 +92,37 @@ void dev_ims_write_msg(struct irq_data *data, struct msi_msg *msg)
__dev_write_ims_msg(desc, msg);
 }
 EXPORT_SYMBOL_GPL(dev_ims_write_msg);
+
+void dev_ims_teardown_irqs(struct device *dev)
+{
+   struct msi_desc *entry;
+
+   for_each_msi_entry(entry, dev)
+   if (entry->irq && entry->tag == IRQ_MSI_TAG_IMS)
+   arch_teardown_msi_irq(entry->irq);
+}
+
+static void dev_ims_restore_irq(struct device *dev, int irq)
+{
+   struct msi_desc *entry = NULL;
+   struct dev_ims_ops *ops;
+
+   

[RFC V1 2/7] drivers/base: Introduce callbacks for IMS interrupt domain

2019-09-12 Thread Megha Dey
This patch serves as a preparatory patch to introduce a new IMS
(Interrupt Message Store) domain. It consists of APIs which would
be used as callbacks to the IRQ chip associated with the IMS domain.

The APIs introduced in this patch are:
dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
dev_ims_unmask_irq - Generic irq chip callback to unmask IMS interrupts
dev_ims_write_msg - Helper to write MSI message to Device IMS

It also introduces IMS specific structures namely:
dev_ims_ops - Callbacks for IMS domain ops
dev_ims_desc - Device specific IMS msi descriptor data
dev_ims_priv_data - Internal data structure containing a unique devid
and a pointer to the IMS domain ops

Lastly, it adds a new config option MSI_IMS which must be enabled by
any driver who would want to use the IMS infrastructure.

Since IMS is not PCI compliant (like platform-msi), most of the code is
similar to platform-msi.c.

TODO: Conclude if ims-msi.c and platform-msi.c can be merged.
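Since the series keeps all device-specific IMS access behind dev_ims_ops callbacks (irq_mask, irq_unmask, irq_write_msi_msg, per the hunks below), a driver's side of that contract can be sketched roughly as follows. Every type and name here is an illustrative stand-in, not part of the posted patches:

```c
#include <assert.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel types; the real msi_desc and
 * dev_ims_ops live in include/linux/msi.h in this series. */
struct fake_ims_slot {
	uint32_t ctrl;		/* device-specific IMS control word */
};

struct fake_desc {
	struct fake_ims_slot *slot;
};

#define IMS_CTRL_MASK_BIT 0x1u

/* A driver's irq_mask callback: set the mask bit in the device's
 * interrupt message store and return the resulting mask state,
 * which the core caches in desc->dev_ims.masked. */
static uint32_t demo_irq_mask(struct fake_desc *desc)
{
	desc->slot->ctrl |= IMS_CTRL_MASK_BIT;
	return desc->slot->ctrl & IMS_CTRL_MASK_BIT;
}

static uint32_t demo_irq_unmask(struct fake_desc *desc)
{
	desc->slot->ctrl &= ~IMS_CTRL_MASK_BIT;
	return desc->slot->ctrl & IMS_CTRL_MASK_BIT;
}
```

The core's __dev_ims_desc_mask_irq() below simply dispatches to these callbacks and records the returned mask bits.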

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 drivers/base/Kconfig   |  7 
 drivers/base/Makefile  |  1 +
 drivers/base/ims-msi.c | 94 ++
 include/linux/msi.h| 35 ++-
 4 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/ims-msi.c

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index dc40449..038fabd 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -206,3 +206,10 @@ config GENERIC_ARCH_TOPOLOGY
  runtime.
 
 endmenu
+
+config MSI_IMS
+   bool "Device Specific Interrupt Message Storage (IMS)"
+   select GENERIC_MSI_IRQ
+   help
+ This allows device drivers to enable device specific
+ interrupt message storage (IMS) besides standard MSI-X interrupts.
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 1574520..659b9b0 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_SOC_BUS) += soc.o
 obj-$(CONFIG_PINCTRL) += pinctrl.o
 obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
 obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
+obj-$(CONFIG_MSI_IMS) += ims-msi.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
 
 obj-y  += test/
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
new file mode 100644
index 000..68dc10f
--- /dev/null
+++ b/drivers/base/ims-msi.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2019 Intel Corporation.
+ *
+ * Author: Megha Dey 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct dev_ims_priv_data {
+   struct device   *dev;
+   msi_alloc_info_targ;
+   int devid;
+   struct dev_ims_ops  *ims_ops;
+};
+
+u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
+{
+   u32 mask_bits = desc->dev_ims.masked;
+   struct dev_ims_ops *ops;
+
+   ops = desc->dev_ims.priv->ims_ops;
+   if (!ops)
+   return 0;
+
+   if (flag) {
+   if (ops->irq_mask)
+   mask_bits = ops->irq_mask(desc);
+   } else {
+   if (ops->irq_unmask)
+   mask_bits = ops->irq_unmask(desc);
+   }
+
+   return mask_bits;
+}
+
+static void ims_mask_irq(struct msi_desc *desc, u32 flag)
+{
+   desc->dev_ims.masked = __dev_ims_desc_mask_irq(desc, flag);
+}
+
+static void ims_set_mask_bit(struct irq_data *data, u32 flag)
+{
+   struct msi_desc *desc = irq_data_get_msi_desc(data);
+
+   ims_mask_irq(desc, flag);
+}
+
+static void __dev_write_ims_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   struct dev_ims_ops *ops;
+
+   ops = desc->dev_ims.priv->ims_ops;
+   if (ops && ops->irq_write_msi_msg)
+   ops->irq_write_msi_msg(desc, msg);
+
+   desc->msg = *msg;
+}
+
+/**
+ * dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
+ * @data: pointer to irqdata associated to that interrupt
+ */
+void dev_ims_mask_irq(struct irq_data *data)
+{
+   ims_set_mask_bit(data, 1);
+}
+EXPORT_SYMBOL_GPL(dev_ims_mask_irq);
+
+/**
+ * dev_ims_unmask_irq - Generic irq chip callback to unmask IMS interrupts
+ * @data: pointer to irqdata associated to that interrupt
+ */
+void dev_ims_unmask_irq(struct irq_data *data)
+{
+   ims_set_mask_bit(data, 0);
+}
+EXPORT_SYMBOL_GPL(dev_ims_unmask_irq);
+
+/**
+ * dev_ims_write_msg - Helper to write MSI message to Device IMS
+ * @irq_data: Pointer to interrupt data of the MSI interrupt
+ * @msg:  Pointer to the message
+ */
+void dev_ims_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+   struct msi_desc *desc = irq_data_get_msi_desc(data);
+
+   __dev_write_ims_msg(desc, msg);
+}
+EXPORT_SYMBOL_GPL(dev_ims_write_msg);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 22591b6..246285a 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,6 

[RFC V1 7/7] ims: Add the set_desc callback

2019-09-12 Thread Megha Dey
Add the set_desc callback to the ims domain ops.

The set_desc callback is used to find a unique hwirq number from a given
domain.

Each mdev can have a maximum of 2048 IMS interrupts.
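The 2048 limit follows directly from the hwirq encoding in dev_ims_calc_hwirq() in the hunk below: with DEVIMS_ID_SHIFT = 21 (from patch 6/7), 32 - 21 = 11 bits remain for the IMS index. A minimal sketch of the arithmetic:

```c
#include <assert.h>
#include <stdint.h>

#define DEVIMS_ID_SHIFT	21	/* from patch 6/7 */

/* Mirrors dev_ims_calc_hwirq(): the per-device id occupies the upper
 * DEVIMS_ID_SHIFT bits, leaving 32 - 21 = 11 bits for the IMS index,
 * i.e. up to 2^11 = 2048 interrupts per mdev. */
static uint32_t ims_hwirq(uint32_t devid, uint32_t ims_index)
{
	return (devid << (32 - DEVIMS_ID_SHIFT)) | ims_index;
}
```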

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/kernel/apic/ims.c | 7 +++
 drivers/base/ims-msi.c | 9 +
 include/linux/msi.h| 1 +
 3 files changed, 17 insertions(+)

diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
index a539666..7e36571 100644
--- a/arch/x86/kernel/apic/ims.c
+++ b/arch/x86/kernel/apic/ims.c
@@ -76,11 +76,18 @@ int dev_ims_prepare(struct irq_domain *domain, struct device *dev, int nvec,
 }
 EXPORT_SYMBOL_GPL(dev_ims_prepare);
 
+void dev_ims_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+   arg->msi_hwirq = dev_ims_calc_hwirq(desc);
+}
+EXPORT_SYMBOL_GPL(dev_ims_set_desc);
+
 #ifdef CONFIG_IRQ_REMAP
 
 static struct msi_domain_ops dev_ims_domain_ops = {
.get_hwirq  = msi_get_hwirq,
.msi_prepare= dev_ims_prepare,
+   .set_desc   = dev_ims_set_desc,
 };
 
 static struct irq_chip dev_ims_ir_controller = {
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index 3e579c9..48f3d24 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -22,6 +22,15 @@ struct dev_ims_priv_data {
 
 static DEFINE_IDA(dev_ims_devid_ida);
 
+irq_hw_number_t dev_ims_calc_hwirq(struct msi_desc *desc)
+{
+   u32 devid;
+
+   devid = desc->dev_ims.priv->devid;
+
+   return (devid << (32 - DEVIMS_ID_SHIFT)) | desc->dev_ims.ims_index;
+}
+
 u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
u32 mask_bits = desc->dev_ims.masked;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4543bbf..fe4678e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -237,6 +237,7 @@ void dev_ims_teardown_irqs(struct device *dev);
 void dev_ims_restore_irqs(struct device *dev);
 int dev_ims_alloc_irqs(struct device *dev, int nvec, struct dev_ims_ops *ops);
 void dev_ims_free_irqs(struct device *dev);
+irq_hw_number_t dev_ims_calc_hwirq(struct msi_desc *desc);
 
 /*
  * The arch hooks to setup up msi irqs. Those functions are
-- 
2.7.4



[RFC V1 1/7] genirq/msi: Differentiate between various MSI based interrupts

2019-09-12 Thread Megha Dey
Since a device can support both MSI-X and IMS interrupts simultaneously,
do away with is_msix and introduce a new enum msi_desc_tag to
differentiate between the various types of msi_descs.

Signed-off-by: Megha Dey 
---
 arch/mips/pci/msi-xlp.c|  2 +-
 arch/s390/pci/pci_irq.c|  2 +-
 arch/x86/kernel/apic/msi.c |  2 +-
 arch/x86/pci/xen.c |  2 +-
 drivers/pci/msi.c  | 19 ++-
 include/linux/msi.h| 11 ++-
 kernel/irq/msi.c   |  2 +-
 7 files changed, 25 insertions(+), 15 deletions(-)
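The hunks below only show the tag being tested; the enum itself is in the truncated include/linux/msi.h hunk. Based on the three constants used throughout the diff, it presumably looks something like this hypothetical reconstruction (member order and any extra members in the posted patch may differ):

```c
#include <assert.h>

/* Hypothetical sketch of the tag added to struct msi_desc; only the
 * three constant names are confirmed by the hunks below. */
enum msi_desc_tag {
	IRQ_MSI_TAG_MSI,	/* classic PCI MSI */
	IRQ_MSI_TAG_MSIX,	/* PCI MSI-X */
	IRQ_MSI_TAG_IMS,	/* device-specific interrupt message store */
};
```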

diff --git a/arch/mips/pci/msi-xlp.c b/arch/mips/pci/msi-xlp.c
index bb14335..0f06ad1 100644
--- a/arch/mips/pci/msi-xlp.c
+++ b/arch/mips/pci/msi-xlp.c
@@ -457,7 +457,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
node = slot / 8;
lnkbase = nlm_get_pcie_base(node, link);
 
-   if (desc->msi_attrib.is_msix)
+   if (desc->tag == IRQ_MSI_TAG_MSIX)
return xlp_setup_msix(lnkbase, node, link, desc);
else
return xlp_setup_msi(lnkbase, node, link, desc);
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index d80616a..1938582 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -332,7 +332,7 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
for_each_pci_msi_entry(msi, pdev) {
if (!msi->irq)
continue;
-   if (msi->msi_attrib.is_msix)
+   if (msi->tag == IRQ_MSI_TAG_MSIX)
__pci_msix_desc_mask_irq(msi, 1);
else
__pci_msi_desc_mask_irq(msi, 1, 1);
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7f75334..435bcda 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -98,7 +98,7 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
 
init_irq_alloc_info(arg, NULL);
arg->msi_dev = pdev;
-   if (desc->msi_attrib.is_msix) {
+   if (desc->tag == IRQ_MSI_TAG_MSIX) {
arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_MSI;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 91220cc..5e850b8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -382,7 +382,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
struct msi_desc *msidesc;
 
msidesc = first_pci_msi_entry(dev);
-   if (msidesc->msi_attrib.is_msix)
+   if (msidesc->tag == IRQ_MSI_TAG_MSIX)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0884bed..8a05416 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -235,7 +235,7 @@ static void msi_set_mask_bit(struct irq_data *data, u32 flag)
 {
struct msi_desc *desc = irq_data_get_msi_desc(data);
 
-   if (desc->msi_attrib.is_msix) {
+   if (desc->tag == IRQ_MSI_TAG_MSIX) {
msix_mask_irq(desc, flag);
readl(desc->mask_base); /* Flush write to device */
} else {
@@ -278,7 +278,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 
BUG_ON(dev->current_state != PCI_D0);
 
-   if (entry->msi_attrib.is_msix) {
+   if (entry->tag == IRQ_MSI_TAG_MSIX) {
void __iomem *base = pci_msix_desc_addr(entry);
 
if (!base) {
@@ -313,7 +313,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 
if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) {
/* Don't touch the hardware now */
-   } else if (entry->msi_attrib.is_msix) {
+   } else if (entry->tag == IRQ_MSI_TAG_MSIX) {
void __iomem *base = pci_msix_desc_addr(entry);
 
if (!base)
@@ -376,7 +376,7 @@ static void free_msi_irqs(struct pci_dev *dev)
pci_msi_teardown_msi_irqs(dev);
 
list_for_each_entry_safe(entry, tmp, msi_list, list) {
-   if (entry->msi_attrib.is_msix) {
+   if (entry->tag == IRQ_MSI_TAG_MSIX) {
if (list_is_last(&entry->list, msi_list))
iounmap(entry->mask_base);
}
@@ -471,7 +471,7 @@ static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
entry = irq_get_msi_desc(irq);
if (entry)
return sprintf(buf, "%s\n",
-   entry->msi_attrib.is_msix ? "msix" : "msi");
+   (entry->tag == IRQ_MSI_TAG_MSIX) ? "msix" : "msi");
 
return -ENODEV;
 }
@@ -570,7 +570,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 
pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
 
-   entry->msi_attrib.is_msix   = 0;
+   entry->tag  = IRQ_MSI_TAG_MSI;

[RFC V1 6/7] ims-msi: Add APIs to allocate/free IMS interrupts

2019-09-12 Thread Megha Dey
This patch introduces APIs to allocate and free IMS interrupts.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 drivers/base/ims-msi.c | 216 +
 include/linux/msi.h|   2 +
 2 files changed, 218 insertions(+)

diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index df28ee2..3e579c9 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -7,9 +7,12 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+#define DEVIMS_ID_SHIFT21
+
 struct dev_ims_priv_data {
struct device   *dev;
msi_alloc_info_targ;
@@ -17,6 +20,8 @@ struct dev_ims_priv_data {
struct dev_ims_ops  *ims_ops;
 };
 
+static DEFINE_IDA(dev_ims_devid_ida);
+
 u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
u32 mask_bits = desc->dev_ims.masked;
@@ -126,3 +131,214 @@ void dev_ims_restore_irqs(struct device *dev)
if (entry->tag == IRQ_MSI_TAG_IMS)
dev_ims_restore_irq(dev, entry->irq);
 }
+
+static void dev_ims_free_descs(struct device *dev)
+{
+   struct msi_desc *desc, *tmp;
+
+   for_each_msi_entry(desc, dev)
+   if (desc->irq && desc->tag == IRQ_MSI_TAG_IMS)
+   BUG_ON(irq_has_action(desc->irq));
+
+   dev_ims_teardown_irqs(dev);
+
+   list_for_each_entry_safe(desc, tmp, dev_to_msi_list(dev), list) {
+   if (desc->tag == IRQ_MSI_TAG_IMS) {
+   list_del(>list);
+   free_msi_entry(desc);
+   }
+   }
+}
+
+static int dev_ims_setup_msi_irqs(struct device *dev, int nvec)
+{
+   struct irq_domain *domain;
+
+   domain = dev_get_msi_domain(dev);
+   if (domain && irq_domain_is_hierarchy(domain))
+   return msi_domain_alloc_irqs(domain, dev, nvec);
+
+   return arch_setup_ims_irqs(dev, nvec);
+}
+
+static struct dev_ims_priv_data *
+dev_ims_alloc_priv_data(struct device *dev, unsigned int nvec,
+   struct dev_ims_ops *ops)
+{
+   struct dev_ims_priv_data *datap;
+   int ret;
+
+   /*
+* Currently there is no limit to the number of IRQs a device can
+* allocate.
+*/
+   if (!nvec)
+   return ERR_PTR(-EINVAL);
+
+   datap = kzalloc(sizeof(*datap), GFP_KERNEL);
+   if (!datap)
+   return ERR_PTR(-ENOMEM);
+
+   ret = ida_simple_get(&dev_ims_devid_ida,
+   0, 1 << DEVIMS_ID_SHIFT, GFP_KERNEL);
+
+   if (ret < 0) {
+   kfree(datap);
+   return ERR_PTR(ret);
+   }
+
+   datap->devid = ret;
+   datap->ims_ops = ops;
+   datap->dev = dev;
+
+   return datap;
+}
+
+static int dev_ims_alloc_descs(struct device *dev,
+  int nvec, struct dev_ims_priv_data *data,
+  struct irq_affinity *affd)
+{
+   struct irq_affinity_desc *curmsk, *masks = NULL;
+   struct msi_desc *desc;
+   int i, base = 0;
+
+   if (!list_empty(dev_to_msi_list(dev))) {
+   desc = list_last_entry(dev_to_msi_list(dev),
+   struct msi_desc, list);
+   base = desc->dev_ims.ims_index + 1;
+   }
+
+   if (affd) {
+   masks = irq_create_affinity_masks(nvec, affd);
+   if (!masks)
+   dev_err(dev, "Unable to allocate affinity masks, ignoring\n");
+   }
+
+   for (i = 0, curmsk = masks; i < nvec; i++) {
+   desc = alloc_msi_entry(dev, 1, NULL);
+   if (!desc)
+   break;
+
+   desc->dev_ims.priv = data;
+   desc->tag = IRQ_MSI_TAG_IMS;
+   desc->dev_ims.ims_index = base + i;
+
list_add_tail(&desc->list, dev_to_msi_list(dev));
+
+   if (masks)
+   curmsk++;
+   }
+
+   kfree(masks);
+
+   if (i != nvec) {
+   /* Clean up the mess */
+   dev_ims_free_descs(dev);
+   return -ENOMEM;
+   }
+
+   return 0;
+}
+
+static void dev_ims_free_priv_data(struct dev_ims_priv_data *data)
+{
+   ida_simple_remove(&dev_ims_devid_ida, data->devid);
+   kfree(data);
+}
+
+/**
+ * dev_ims_enable_irqs - Allocate IMS interrupts for @dev
+ * @dev:   The device for which to allocate interrupts
+ * @nvec:  The number of interrupts to allocate
+ * @ops:   IMS device operations
+ * @affd:  optional description of the affinity requirements
+ *
+ * Returns:
+ * Zero for success, or an error code in case of failure
+ */
+int dev_ims_enable_irqs(struct device *dev, unsigned int nvec,
+   struct dev_ims_ops *ops,
+   struct irq_affinity *affd)
+{
+   struct dev_ims_priv_data *priv_data;
+   int err;
+
+   priv_data = 

Re: [PATCH] leds: remove PAGE_SIZE limit of /sys/class/leds//trigger

2019-09-12 Thread Akinobu Mita
On Fri, Sep 13, 2019 at 2:15, Jacek Anaszewski  wrote:
>
> Hi Akinobu,
>
> Please bump patch version each time you send an update
> of the patch with the same subject.

Oops, should I resend with the correct subject?


Re: problem starting /sbin/init (32-bit 5.3-rc8)

2019-09-12 Thread Kees Cook
On Thu, Sep 12, 2019 at 02:40:19PM -0700, Randy Dunlap wrote:
> This is 32-bit kernel, just happens to be running on a 64-bit laptop.
> I added the debug printk in __phys_addr() just before "[cut here]".
> 
> CONFIG_HARDENED_USERCOPY=y

I can reproduce this under CONFIG_DEBUG_VIRTUAL=y, and it goes back
to at least to v5.2. Booting with "hardened_usercopy=off" or without
CONFIG_DEBUG_VIRTUAL makes this go away (since __phys_addr() doesn't
get called):

__check_object_size+0xff/0x1b0:
pfn_to_section_nr at include/linux/mmzone.h:1153
(inlined by) __pfn_to_section at include/linux/mmzone.h:1291
(inlined by) virt_to_head_page at include/linux/mm.h:729
(inlined by) check_heap_object at mm/usercopy.c:230
(inlined by) __check_object_size at mm/usercopy.c:280

Is virt_to_head_page() illegal to use under some recently new conditions?

> The BUG is this line in arch/x86/mm/physaddr.c:
>   VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
> It's line 83 in my source file only due to adding  and
> a conditional pr_crit() call.
> 
> 
> [   19.730409][T1] debug: unmapping init [mem 0xdc7bc000-0xdca30fff]
> [   19.734289][T1] Write protecting kernel text and read-only data: 13888k
> [   19.737675][T1] rodata_test: all tests were successful
> [   19.740757][T1] Run /sbin/init as init process
> [   19.792877][T1] __phys_addr: max_low_pfn=0x36ffe, x=0xff001ff1, phys_addr=0x3f001ff1
> [   19.796561][T1] [ cut here ]
> [   19.797501][T1] kernel BUG at ../arch/x86/mm/physaddr.c:83!
> [   19.802799][T1] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> [   19.803782][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 #6
> [   19.803782][T1] Hardware name: Dell Inc. Inspiron 1318   /0C236D, BIOS A04 01/15/2009
> [   19.803782][T1] EIP: __phys_addr+0xaf/0x100
> [   19.803782][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 50 68 90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 76 11 <0f> 0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
> [   19.803782][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: db90a471
> [   19.803782][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: f41dde90
> [   19.803782][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010216
> [   19.803782][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 000406d0
> [   19.803782][T1] Call Trace:
> [   19.803782][T1]  __check_object_size+0xaf/0x3c0
> [   19.803782][T1]  ? __might_sleep+0x80/0xa0
> [   19.803782][T1]  copy_strings+0x1c2/0x370
> [   19.803782][T1]  copy_strings_kernel+0x2b/0x40
> 
> Full boot log or kernel .config file are available if wanted.

I'll see if I can bisect, but I'm getting on a plane soon...
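For reference, the numbers in the quoted report are internally consistent with the VIRTUAL_BUG_ON() check. A sketch of the arithmetic, assuming the i386 default PAGE_OFFSET of 0xc0000000 (the actual value depends on the reporter's .config):

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_OFFSET	0xc0000000u	/* assumed i386 default */
#define PAGE_SHIFT	12

/* __phys_addr() on x86-32 essentially computes x - PAGE_OFFSET and
 * then checks the resulting pfn against max_low_pfn. */
static int phys_addr_would_bug(uint32_t x, uint32_t max_low_pfn)
{
	uint32_t phys_addr = x - PAGE_OFFSET;

	return (phys_addr >> PAGE_SHIFT) > max_low_pfn;
}
```

With x=0xff001ff1 this gives phys_addr=0x3f001ff1 and pfn 0x3f001, well above max_low_pfn=0x36ffe, so the check fires.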

-- 
Kees Cook


Re: [alsa-devel] [PATCH] ASoC: fsl_sai: Implement set_bclk_ratio

2019-09-12 Thread Nicolin Chen
On Wed, Sep 11, 2019 at 04:06:41PM +0300, Daniel Baluta wrote:
> On Wed, Sep 11, 2019 at 2:01 PM Mark Brown  wrote:
> >
> > On Thu, Sep 05, 2019 at 06:29:39PM -0700, Nicolin Chen wrote:
> > > On Sat, Aug 31, 2019 at 12:59:10AM +0300, Daniel Baluta wrote:
> >
> > > > This is to allow machine drivers to set a certain bitclk rate
> > > > which might not be exactly rate * frame size.
> >
> > > Just a quick thought of mine: slot_width and slots could be
> > > set via set_dai_tdm_slot() actually, while set_bclk_ratio()
> > > would override that one with your change. I'm not sure which
> > > one could be more important...so would you mind elaborating
> > > your use case?
> >
> > The reason we have both operations is partly that some hardware
> > can configure the ratio but not do TDM and partly that setting
> > TDM slots forces us to configure the slot size depending on the
> > current stream configuration while just setting the ratio means
> > we can just fix the configuration once.  I'd say it's just a user
> > error to try to do both simultaneously.
> 
> Yes, exactly. We wanted to have a better control of bclk freq.
> Sorry for the late answer, I'm traveling.

I see. Thanks for the explanation. Just acked.
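For reference, the bclk selection logic the discussed patch adds to fsl_sai_hw_params() reduces to: use the explicitly configured ratio when non-zero, otherwise fall back to slots * slot_width. A standalone sketch:

```c
#include <assert.h>

/* Mirrors the hw_params logic: an explicit bclk_ratio, set via
 * set_bclk_ratio(), overrides the slots * slot_width default. */
static unsigned int sai_bclk_freq(unsigned int bclk_ratio, unsigned int slots,
				  unsigned int slot_width, unsigned int rate)
{
	if (bclk_ratio)
		return bclk_ratio * rate;
	return slots * slot_width * rate;
}
```

For example, 2 slots of 32 bits at 48 kHz gives 3.072 MHz by default, while a machine driver can force any other ratio.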


[PATCH] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

2019-09-12 Thread Sean Christopherson
Use master abort semantics, i.e. reads return all ones and writes are
dropped, to handle unexpected MMIO accesses when reading guest memory
instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
as a guest page fault.

Emulation of certain instructions, notably VMX instructions, involves
reading or writing guest memory without going through the emulator.
These emulation flows are not equipped to handle MMIO accesses as no
sane and properly functioning guest kernel will target MMIO with such
instructions, and so simply inject a page fault in response to
X86EMUL_IO_NEEDED.

While not 100% correct, using master abort semantics is at least
sometimes correct, e.g. non-existent MMIO accesses do actually master
abort, whereas injecting a page fault is always wrong, i.e. the issue
lies in the physical address domain, not in the virtual to physical
translation.
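Concretely, the master-abort fallback amounts to the following on the read side — a simplified sketch of the behavior, not the kernel code itself:

```c
#include <assert.h>
#include <string.h>
#include <stdint.h>

/* Master abort semantics: a read that no device claims returns all
 * ones; the corresponding write side is simply dropped. */
static void mmio_master_abort_read(void *val, size_t bytes)
{
	memset(val, 0xff, bytes);
}
```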

Apply the logic to kvm_write_guest_virt_system() in addition to
replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
the former, i.e. can also leak a host stack address.

Reported-by: Fuqian Huang 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 40 +++-
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..d1d7e9fac17a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
   struct x86_exception *exception)
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   int r;
+
+   r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+  exception);
 
/*
-* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
-* is returned, but our callers are not ready for that and they blindly
-* call kvm_inject_page_fault.  Ensure that they at least do not leak
-* uninitialized kernel stack memory into cr2 and error code.
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
 */
-   memset(exception, 0, sizeof(*exception));
-   return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+   if (r == X86EMUL_IO_NEEDED) {
+   memset(val, 0xff, bytes);
+   return 0;
+   }
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
@@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
 {
+   int r;
+
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
 
-   return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
-  PFERR_WRITE_MASK, exception);
+   r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+   PFERR_WRITE_MASK, exception);
+
+   /*
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
+*/
+   if (r == X86EMUL_IO_NEEDED)
+   return 0;
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-- 
2.22.0



Re: [PATCH] ASoC: fsl_sai: Implement set_bclk_ratio

2019-09-12 Thread Nicolin Chen
On Sat, Aug 31, 2019 at 12:59:10AM +0300, Daniel Baluta wrote:
> From: Viorel Suman 
> 
> This is to allow machine drivers to set a certain bitclk rate
> which might not be exactly rate * frame size.
> 
> Cc: NXP Linux Team 
> Signed-off-by: Viorel Suman 
> Signed-off-by: Daniel Baluta 

Acked-by: Nicolin Chen 

> ---
>  sound/soc/fsl/fsl_sai.c | 21 +++--
>  sound/soc/fsl/fsl_sai.h |  1 +
>  2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
> index fe126029f4e3..e896b577b1f7 100644
> --- a/sound/soc/fsl/fsl_sai.c
> +++ b/sound/soc/fsl/fsl_sai.c
> @@ -137,6 +137,16 @@ static int fsl_sai_set_dai_tdm_slot(struct snd_soc_dai *cpu_dai, u32 tx_mask,
>   return 0;
>  }
>  
> +static int fsl_sai_set_dai_bclk_ratio(struct snd_soc_dai *dai,
> +   unsigned int ratio)
> +{
> + struct fsl_sai *sai = snd_soc_dai_get_drvdata(dai);
> +
> + sai->bclk_ratio = ratio;
> +
> + return 0;
> +}
> +
>  static int fsl_sai_set_dai_sysclk_tr(struct snd_soc_dai *cpu_dai,
>   int clk_id, unsigned int freq, int fsl_dir)
>  {
> @@ -423,8 +433,14 @@ static int fsl_sai_hw_params(struct snd_pcm_substream *substream,
>   slot_width = sai->slot_width;
>  
>   if (!sai->is_slave_mode) {
> - ret = fsl_sai_set_bclk(cpu_dai, tx,
> - slots * slot_width * params_rate(params));
> + if (sai->bclk_ratio)
> + ret = fsl_sai_set_bclk(cpu_dai, tx,
> +sai->bclk_ratio *
> +params_rate(params));
> + else
> + ret = fsl_sai_set_bclk(cpu_dai, tx,
> +slots * slot_width *
> +params_rate(params));
>   if (ret)
>   return ret;
>  
> @@ -640,6 +656,7 @@ static void fsl_sai_shutdown(struct snd_pcm_substream *substream,
>  }
>  
>  static const struct snd_soc_dai_ops fsl_sai_pcm_dai_ops = {
> + .set_bclk_ratio = fsl_sai_set_dai_bclk_ratio,
>   .set_sysclk = fsl_sai_set_dai_sysclk,
>   .set_fmt= fsl_sai_set_dai_fmt,
>   .set_tdm_slot   = fsl_sai_set_dai_tdm_slot,
> diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h
> index 3a3f6f8e5595..f96f8d97489d 100644
> --- a/sound/soc/fsl/fsl_sai.h
> +++ b/sound/soc/fsl/fsl_sai.h
> @@ -177,6 +177,7 @@ struct fsl_sai {
>   unsigned int mclk_streams;
>   unsigned int slots;
>   unsigned int slot_width;
> + unsigned int bclk_ratio;
>  
>   const struct fsl_sai_soc_data *soc_data;
>   struct snd_dmaengine_dai_dma_data dma_params_rx;
> -- 
> 2.17.1
> 


Re: [PATCH 3/3] ASoC: fsl_asrc: Fix error with S24_3LE format bitstream in i.MX8

2019-09-12 Thread Nicolin Chen
On Wed, Sep 11, 2019 at 12:08:07PM +0100, Mark Brown wrote:
> On Mon, Sep 09, 2019 at 06:52:13PM -0700, Nicolin Chen wrote:
> 
> > And a quick feeling is that below code is mostly identical to what
> > is in the soc-generic-dmaengine-pcm.c file. So I'm wondering if we
> > could abstract a helper function somewhere in the ASoC core: Mark?
> 
> That's roughly what sound/core/pcm_dmaengine.c is doing -
> possibly we should move more stuff into there.

It looks like a right place to me. Thank you!


Re: [PATCH 2/3] ASoC: fsl_asrc: update supported sample format

2019-09-12 Thread Nicolin Chen
On Tue, Sep 10, 2019 at 02:07:25AM +, S.j. Wang wrote:
> > On Mon, Sep 09, 2019 at 06:33:20PM -0400, Shengjiu Wang wrote:
> > > The ASRC supports 24-bit/16-bit/8-bit input width, so the S20_3LE format
> > > should not be supported, as its word width is 20 bits.
> > 
> > I thought 3LE used 24-bit physical width. And the driver assigns
> > ASRC_WIDTH_24_BIT to "width" for all non-16bit cases, so 20-bit would go
> > for that 24-bit slot also. I don't clearly recall if I had explicitly tested
> > S20_3LE, but I feel it should work since I put it there...
> 
> For S20_3LE, the width is 20 bits, but the ASRC only supports 24 bits. If
> ASRMCR1n.IWD is set to 24-bit while the actual width is 20 bits, the volume is
> lower than expected; it is like 24-bit data right-shifted by 4 bits.
> So it is not supported.

Hmm..S20_3LE right-aligns 20 bits in a 24-bit slot? I thought
they're left aligned...

If this is the case...shouldn't we have the same lower-volume
problem for all hardware that supports S20_3LE now?
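The reported volume drop is easy to see numerically: a 20-bit sample that ends up right-aligned in a 24-bit slot reads back 16x (4 bits, roughly 24 dB) smaller than the same sample left-aligned. A quick sketch:

```c
#include <assert.h>
#include <stdint.h>

/* A 20-bit sample interpreted as 24-bit data, depending on its
 * alignment within the 24-bit slot. */
static int32_t s20_right_aligned_as_s24(int32_t s20)	/* bits 19..0 */
{
	return s20;
}

static int32_t s20_left_aligned_as_s24(int32_t s20)	/* bits 23..4 */
{
	return s20 << 4;
}
```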


Re: [PATCH] KVM: x86: work around leak of uninitialized stack contents

2019-09-12 Thread Sean Christopherson
On Thu, Sep 12, 2019 at 02:20:09PM -0700, Jim Mattson wrote:
> On Wed, Sep 11, 2019 at 9:18 PM Fuqian Huang  wrote:
> >
> > Emulation of VMPTRST can incorrectly inject a page fault
> > when passed an operand that points to an MMIO address.
> > The page fault will use uninitialized kernel stack memory
> > as the CR2 and error code.
> >
> > The right behavior would be to abort the VM with a KVM_EXIT_INTERNAL_ERROR
> > exit to userspace; however, it is not an easy fix, so for now just ensure
> > that the error code and CR2 are zero.
> >
> > Signed-off-by: Fuqian Huang 
> > ---
> >  arch/x86/kvm/x86.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 290c3c3efb87..7f442d710858 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5312,6 +5312,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
> > /* kvm_write_guest_virt_system can pull in tons of pages. */
> > vcpu->arch.l1tf_flush_l1d = true;
> >
> > +   memset(exception, 0, sizeof(*exception));
> > return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> >PFERR_WRITE_MASK, exception);
> >  }
> > --
> > 2.11.0
> >
> Perhaps you could also add a comment like the one Paolo added when he
> made the same change in kvm_read_guest_virt?
> See commit 353c0956a618 ("KVM: x86: work around leak of uninitialized
> stack contents (CVE-2019-7222)").

I have a better hack-a-fix, we can handle the unexpected MMIO using master
abort semantics, i.e. reads return all ones, writes are dropped.  It's not
100% correct as KVM won't handle the case where the address is legit MMIO,
but it's at least sometimes correct and thus better than a #PF.

Patch and a unit test incoming...


Re: [rfc patch script] treewide conversion of __section(foo) to section("foo");

2019-09-12 Thread Joe Perches
On Thu, 2019-09-12 at 15:45 -0700, Nick Desaulniers wrote:
> If you want to email me just the patch file (so I don't have to
> copy+pasta from an email),

Lazy... ;)

> I'd be happy to apply it and compile+boot test a few more arch's
> than x86.

Thanks.  attached.



section.pl
Description: Perl program


Memory corruption (redzone overwritten) names_cache?

2019-09-12 Thread Jakub Jankowski

Hi,

We're getting some random memory corruption on an AWS EC2 instance with 
4.19.x kernels. I've tried 4.19.19, 4.19.52, but the results below are 
from the most recent (4.19.72). For debugging I enabled 
KASAN+slub_debug, but TBH, I can't make heads or tails from these.


Without slub_debug, the host reboots within a couple of minutes of uptime. 
With slub_debug it survives a bit longer, but eventually all sorts of 
issues manifest (including: reboot; ps not being able to read some 
processes' /proc/<pid>/cmdline while /proc/<pid>/stack shows 
acct_collect()->down_read(), etc).


Upon multiple tests, the slab I most often saw pop up as the first one 
detected as corrupted was names_cache.
What is really weird is that multiple times I saw the redzone being 
overwritten by the same content, which looks like part of 'sessions.py' 
from Python's 'requests' module.


Any debugging hints would be greatly appreciated.


Command line: BOOT_IMAGE=(hd0,msdos2)/vmlinuz ro root=/dev/xvda5 console=tty0 
console=ttyS0,9600n8 crashkernel=512M-2G:64M,2G-:128M kmemleak=on 
slub_debug=FZPU slub_nomerge
(...)
[  262.957418] 
=
[  262.957423] BUG vm_area_struct (Tainted: GB  O ): Redzone 
overwritten
[  262.957424] 
-

[  262.957427] INFO: 0xb91cc681-0x98bd5238. First byte 0x6e 
instead of 0xcc
[  262.957433] INFO: Allocated in vm_area_dup+0x1e/0x180 age=6117 cpu=0 pid=8187
[  262.957438]  kmem_cache_alloc+0x1a4/0x1d0
[  262.957439]  vm_area_dup+0x1e/0x180
[  262.957441]  copy_process.part.4+0x2fa9/0x6cd0
[  262.957443]  _do_fork+0x151/0x7a0
[  262.957446]  do_syscall_64+0x9b/0x290
[  262.957452]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  262.957455] INFO: Freed in qlist_free_all+0x37/0xd0 age=7431 cpu=0 pid=8521
[  262.957457]  quarantine_reduce+0x1a2/0x210
[  262.957458]  kasan_kmalloc+0x95/0xc0
[  262.957460]  kmem_cache_alloc+0xc6/0x1d0
[  262.957463]  getname_flags+0xba/0x510
[  262.957465]  user_path_at_empty+0x1d/0x40
[  262.957468]  vfs_statx+0xb9/0x140
[  262.957470]  __se_sys_newstat+0x7c/0xd0
[  262.957472]  do_syscall_64+0x9b/0x290
[  262.957474]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  262.957476] INFO: Slab 0xca532806 objects=30 used=24 
fp=0x6ce6da86 flags=0x2008101
[  262.957477] INFO: Object 0x5eb7e26b @offset=8 fp=0xac807fa7

[  262.957480] Redzone b91cc681: 6e 73 2e 70 79 5c 22 2c 
 ns.py\",
[  262.957482] Object 5eb7e26b: 20 6c 69 6e 65 20 36 34 36 2c 20 69 6e 
20 73 65   line 646, in se
[  262.957484] Object 7d5d4673: 6e 64 5c 6e 20 20 20 20 72 20 3d 20 61 
64 61 70  nd\nr = adap
[  262.957485] Object a3cf6db1: 74 65 72 2e 73 65 6e 64 28 72 65 71 75 
65 73 74  ter.send(request
[  262.957487] Object d8b14cdd: 2c 20 2a 2a 6b 77 61 72 00 00 00 00 00 
00 00 00  , **kwar
[  262.957489] Object 5eca0928: 40 97 5a 73 83 88 ff ff 25 00 00 00 00 
00 00 80  @.Zs%...
[  262.957491] Object 592ffbd7: 71 00 00 00 00 00 00 00 e0 c8 22 6d 83 88 ff 
ff  q."m
[  262.957492] Object 84c88ae5: 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00  
[  262.957494] Object ea6d1cb3: 83 00 00 00 00 00 00 00 80 c0 fd 5a 83 
88 ff ff  ...Z
[  262.957495] Object a236617c: 80 c0 fd 5a 83 88 ff ff 00 00 00 00 00 
00 00 00  ...Z
[  262.957497] Object 91c7956c: 00 3a 94 b0 ff ff ff ff 75 00 00 00 00 
00 00 00  .:..u...
[  262.957499] Object 216cef35: c0 85 cc 6a 83 88 ff ff 00 00 00 00 00 
00 00 00  ...j
[  262.957500] Object e0fd506c: 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00  
[  262.957502] Redzone f5906e86: cc cc cc cc cc cc cc cc
  
[  262.957503] Padding 53d79574: 5a 5a 5a 5a 5a 5a 5a 5a
  
[  262.957507] CPU: 3 PID: 11769 Comm: ps Kdump: loaded Tainted: GB  O  
4.19.72_3upstreamdbg #1
[  262.957508] Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006
[  262.957509] Call Trace:
[  262.957516]  dump_stack+0x9a/0xf0
[  262.957519]  check_bytes_and_report.cold.24+0x3f/0x6b
[  262.957521]  check_object+0x17c/0x280
[  262.957524]  free_debug_processing+0x105/0x2a0
[  262.957526]  ? qlist_free_all+0x37/0xd0
[  262.957527]  ? qlist_free_all+0x37/0xd0
[  262.957529]  __slab_free+0x218/0x3b0
[  262.957533]  ? __free_pages_ok+0x62f/0x840
[  262.957536]  ? _raw_spin_unlock_irqrestore+0x2b/0x40
[  262.957537]  ? qlist_free_all+0x37/0xd0
[  262.957541]  ? trace_hardirqs_on+0x35/0x140
[  262.957543]  ? qlist_free_all+0x37/0xd0
[  262.957544]  qlist_free_all+0x4c/0xd0
[  262.957546]  quarantine_reduce+0x1a2/0x210
[  262.957549]  ? getname_flags+0xba/0x510
[  262.957550]  kasan_kmalloc+0x95/0xc0
[  262.957553]  ? 

clang-format and 'clang-format on' and 'clang-format off'

2019-09-12 Thread Joe Perches
On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:

> > Marking sections _no_auto_format_ isn't really a
> > good solution is it?
> 
> I am thinking about special tables that are hand-crafted or very
> complex macros. For those, yes, I think it is a fine solution.

Can the 'clang-format on/off' trigger be indirected into
something non-clang specific via a macro?

Not every project is going to use only the clang-format tool.



Re: [EXT] Re: [PATCH 1/3] ASoC: fsl_asrc: Use in(out)put_format instead of in(out)put_word_width

2019-09-12 Thread Nicolin Chen
On Tue, Sep 10, 2019 at 02:22:06AM +, S.j. Wang wrote:
> Hi
> 
> > 
> > On Mon, Sep 09, 2019 at 06:33:19PM -0400, Shengjiu Wang wrote:
> > > snd_pcm_format_t is more formal than enum asrc_word_width: it has
> > > two properties, width and physical width, which is more accurate than
> > > enum asrc_word_width. So it is better to use in(out)put_format instead
> > > of in(out)put_word_width.
> > 
> > Hmm...I don't really see the benefit of using snd_pcm_format_t here...I
> > mean, I know it's a generic one, and would understand if we use it as a
> > param for a common API. But this patch merely packs the "width" by
> > intentionally using this snd_pcm_format_t and then adds another
> > translation to unpack it... I feel it's a bit overcomplicated. Or am I 
> > missing
> > something?
> > 
> > And I feel it's not necessary to use ALSA common format in our own "struct
> > asrc_config" since it is more IP/register specific.
> > 
> > Thanks
> > Nicolin
> > 
> 
> As you know, we have another M2M function internally. When the user wants to
> set the format through the M2M API, it is better to use snd_pcm_format_t
> instead of the width, for snd_pcm_format_t includes two properties, data
> width and physical width. In the driver, some places need the data width and
> some places need the physical width. For example, to distinguish S24_LE and
> S24_3LE in the driver: the DMA setting needs the physical width, but the
> ASRC needs the data width.
> 
> Another purpose is that we have another newly designed ASRC, which supports
> more formats. I would like it to share the same API with this ASRC. Using
> snd_pcm_format_t we can use the common APIs, like snd_pcm_format_linear and
> snd_pcm_format_big_endian, to get the properties of the format, which is
> needed by the driver.

I see. Just acked the patch.


Re: [PATCH 1/3] ASoC: fsl_asrc: Use in(out)put_format instead of in(out)put_word_width

2019-09-12 Thread Nicolin Chen
On Mon, Sep 09, 2019 at 06:33:19PM -0400, Shengjiu Wang wrote:
> snd_pcm_format_t is more formal than enum asrc_word_width: it has
> two properties, width and physical width, which is more accurate than
> enum asrc_word_width. So it is better to use in(out)put_format
> instead of in(out)put_word_width.
> 
> Signed-off-by: Shengjiu Wang 

Acked-by: Nicolin Chen 

> ---
>  sound/soc/fsl/fsl_asrc.c | 56 +++-
>  sound/soc/fsl/fsl_asrc.h |  4 +--
>  2 files changed, 40 insertions(+), 20 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl_asrc.c b/sound/soc/fsl/fsl_asrc.c
> index cfa40ef6b1ca..4d3804a1ea55 100644
> --- a/sound/soc/fsl/fsl_asrc.c
> +++ b/sound/soc/fsl/fsl_asrc.c
> @@ -265,6 +265,8 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   struct asrc_config *config = pair->config;
>   struct fsl_asrc *asrc_priv = pair->asrc_priv;
>   enum asrc_pair_index index = pair->index;
> + enum asrc_word_width input_word_width;
> + enum asrc_word_width output_word_width;
>   u32 inrate, outrate, indiv, outdiv;
>   u32 clk_index[2], div[2];
>   int in, out, channels;
> @@ -283,9 +285,32 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   return -EINVAL;
>   }
>  
> - /* Validate output width */
> - if (config->output_word_width == ASRC_WIDTH_8_BIT) {
> - pair_err("does not support 8bit width output\n");
> + switch (snd_pcm_format_width(config->input_format)) {
> + case 8:
> + input_word_width = ASRC_WIDTH_8_BIT;
> + break;
> + case 16:
> + input_word_width = ASRC_WIDTH_16_BIT;
> + break;
> + case 24:
> + input_word_width = ASRC_WIDTH_24_BIT;
> + break;
> + default:
> + pair_err("does not support this input format, %d\n",
> +  config->input_format);
> + return -EINVAL;
> + }
> +
> + switch (snd_pcm_format_width(config->output_format)) {
> + case 16:
> + output_word_width = ASRC_WIDTH_16_BIT;
> + break;
> + case 24:
> + output_word_width = ASRC_WIDTH_24_BIT;
> + break;
> + default:
> + pair_err("does not support this output format, %d\n",
> +  config->output_format);
>   return -EINVAL;
>   }
>  
> @@ -383,8 +408,8 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   /* Implement word_width configurations */
>   regmap_update_bits(asrc_priv->regmap, REG_ASRMCR1(index),
>  ASRMCR1i_OW16_MASK | ASRMCR1i_IWD_MASK,
> -ASRMCR1i_OW16(config->output_word_width) |
> -ASRMCR1i_IWD(config->input_word_width));
> +ASRMCR1i_OW16(output_word_width) |
> +ASRMCR1i_IWD(input_word_width));
>  
>   /* Enable BUFFER STALL */
>   regmap_update_bits(asrc_priv->regmap, REG_ASRMCR(index),
> @@ -497,13 +522,13 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
> struct snd_soc_dai *dai)
>  {
>   struct fsl_asrc *asrc_priv = snd_soc_dai_get_drvdata(dai);
> - int width = params_width(params);
>   struct snd_pcm_runtime *runtime = substream->runtime;
>   struct fsl_asrc_pair *pair = runtime->private_data;
>   unsigned int channels = params_channels(params);
>   unsigned int rate = params_rate(params);
>   struct asrc_config config;
> - int word_width, ret;
> + snd_pcm_format_t format;
> + int ret;
>  
>   ret = fsl_asrc_request_pair(channels, pair);
>   if (ret) {
> @@ -513,15 +538,10 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
>  
>   pair->config = &config;
>  
> - if (width == 16)
> - width = ASRC_WIDTH_16_BIT;
> - else
> - width = ASRC_WIDTH_24_BIT;
> -
>   if (asrc_priv->asrc_width == 16)
> - word_width = ASRC_WIDTH_16_BIT;
> + format = SNDRV_PCM_FORMAT_S16_LE;
>   else
> - word_width = ASRC_WIDTH_24_BIT;
> + format = SNDRV_PCM_FORMAT_S24_LE;
>  
>   config.pair = pair->index;
>   config.channel_num = channels;
> @@ -529,13 +549,13 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
>   config.outclk = OUTCLK_ASRCK1_CLK;
>  
>   if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
> - config.input_word_width   = width;
> - config.output_word_width  = word_width;
> + config.input_format   = params_format(params);
> + config.output_format  = format;
>   config.input_sample_rate  = rate;
>   config.output_sample_rate = asrc_priv->asrc_rate;
>   } else {
> - config.input_word_width   = word_width;
> - config.output_word_width  = width;
> + 

[PATCH] mm: memory: fix /proc/meminfo reporting for MLOCK_ONFAULT

2019-09-12 Thread Lucian Adrian Grijincu
As pages are faulted in MLOCK_ONFAULT correctly updates
/proc/self/smaps, but doesn't update /proc/meminfo's Mlocked field.

- Before: /proc/meminfo fields didn't change as pages were faulted in:

```
= Start =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
= Creating testfile =

= after mlock2(MLOCK_ONFAULT) =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:0 kB

= after reading half of the file =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:   524288 kB

= after reading the entire file =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:  1048576 kB

= after munmap =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
```

- After: /proc/meminfo fields are properly updated as pages are touched:

```
= Start =
/proc/meminfo
Unevictable:  60 kB
Mlocked:  60 kB
= Creating testfile =

= after mlock2(MLOCK_ONFAULT) =
/proc/meminfo
Unevictable:  60 kB
Mlocked:  60 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:0 kB

= after reading half of the file =
/proc/meminfo
Unevictable:  524220 kB
Mlocked:  524220 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:   524288 kB

= after reading the entire file =
/proc/meminfo
Unevictable: 1048496 kB
Mlocked: 1048508 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:  1048576 kB

= after munmap =
/proc/meminfo
Unevictable: 176 kB
Mlocked:  60 kB
/proc/self/smaps
```

Repro code.
---

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int mlock2wrap(const void* addr, size_t len, int flags) {
  return syscall(SYS_mlock2, addr, len, flags);
}

void smaps() {
  char smapscmd[1000];
  snprintf(
  smapscmd,
  sizeof(smapscmd) - 1,
  "grep testfile -A 20 /proc/%d/smaps | grep -E '(testfile|Locked)'",
  getpid());
  printf("/proc/self/smaps\n");
  fflush(stdout);
  system(smapscmd);
}

void meminfo() {
  const char* meminfocmd = "grep -E '(Mlocked|Unevictable)' /proc/meminfo";
  printf("/proc/meminfo\n");
  fflush(stdout);
  system(meminfocmd);
}

#define PCHECK(call)\
  { \
int rc = (call);\
if (rc != 0) {  \
  printf("error %d %s\n", rc, strerror(errno)); \
  exit(1);  \
}   \
  }
int main(int argc, char* argv[]) {
  printf("= Start =\n");
  meminfo();

  printf("= Creating testfile =\n");
  size_t size = 1 << 30; // 1 GiB
  int fd = open("testfile", O_CREAT | O_RDWR, 0666);
  {
void* buf = malloc(size);
write(fd, buf, size);
free(buf);
  }
  int ret = 0;
  void* addr = NULL;
  addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

  if (argc > 1) {
PCHECK(mlock2wrap(addr, size, MLOCK_ONFAULT));
printf("= after mlock2(MLOCK_ONFAULT) =\n");
meminfo();
smaps();

for (size_t i = 0; i < size / 2; i += 4096) {
  ret += ((char*)addr)[i];
}
printf("= after reading half of the file =\n");
meminfo();
smaps();

for (size_t i = 0; i < size; i += 4096) {
  ret += ((char*)addr)[i];
}
printf("= after reading the entire file =\n");
meminfo();
smaps();

  } else {
PCHECK(mlock(addr, size));
printf("= after mlock =\n");
meminfo();
smaps();
  }

  PCHECK(munmap(addr, size));
  printf("= after munmap =\n");
  meminfo();
  smaps();

  return ret;
}

---

Signed-off-by: Lucian Adrian Grijincu 
---
 mm/memory.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index e0c232fe81d9..7e8dc3ed4e89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,9 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct 
mem_cgroup *memcg,
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
+   if ((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED &&
+   !PageTransCompound(page))
+   mlock_vma_page(page);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
-- 
2.17.1



Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-12 Thread Aubrey Li
On Thu, Sep 12, 2019 at 8:04 PM Aaron Lu  wrote:
>
> On Wed, Sep 11, 2019 at 09:19:02AM -0700, Tim Chen wrote:
> > On 9/11/19 7:02 AM, Aaron Lu wrote:
> > I think Julien's result show that my patches did not do as well as
> > your patches for fairness. Aubrey did some other testing with the same
> > conclusion.  So I think keeping the forced idle time balanced is not
> > enough for maintaining fairness.
>
> Well, I have done following tests:
> 1 Julien's test script: https://paste.debian.net/plainh/834cf45c
> 2 start two tagged will-it-scale/page_fault1, see how each performs;
> 3 Aubrey's mysql test: https://github.com/aubreyli/coresched_bench.git
>
> They all show your patchset performs equally well... And considering what
> the patch does, I think they are really doing the same thing in
> different ways.

It looks like we are not on the same page, if you don't mind, can both of
you rebase your patchset onto v5.3-rc8 and provide a public branch so I
can fetch and test it at least by my benchmark?

Thanks,
-Aubrey


RE: [PATCH] scsi: storvsc: Add the support of hibernation

2019-09-12 Thread Dexuan Cui
> From: linux-scsi-ow...@vger.kernel.org 
> On Behalf Of kbuild test robot
> Sent: Thursday, September 12, 2019 1:54 PM
> To: Dexuan Cui 
> Cc: kbuild-...@01.org; KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; sas...@kernel.org; j...@linux.ibm.com;
> martin.peter...@oracle.com; linux-hyp...@vger.kernel.org;
> linux-s...@vger.kernel.org; linux-kernel@vger.kernel.org; Michael Kelley
> ; Dexuan Cui 
> Subject: Re: [PATCH] scsi: storvsc: Add the support of hibernation
> 
> Hi Dexuan,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on linus/master]
> [cannot apply to v5.3-rc8 next-20190904]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system]
> 
> >> drivers//scsi/storvsc_drv.c:1982:3: error: 'struct hv_driver' has no member
> named 'suspend'
>  .suspend = storvsc_suspend,
>   ^~~

This build failure is expected: In the patch mail, I mentioned this patch
has a build dependency on the commit 271b2224d42f ("Drivers: hv: vmbus: 
Implement
suspend/resume for VSC drivers for hibernation"), which is on Sasha Levin's
Hyper-V tree's hyperv-next branch:
https://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git/log/?h=hyperv-next

Thanks,
-- Dexuan


Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Joe Perches
On Thu, 2019-09-12 at 16:00 -0700, Nick Desaulniers wrote:

> Consider the fact that not all kernel developers run checkpatch.pl.
> Is that a deficiency in checkpatch.pl, or the lack of enforcement in
> kernel developers' workflows?

No.  Mostly it's because the kernel is like a bunch of little
untethered development planets, each with a little prince that
wants to keep their own little fiefdom separate from the others.




Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Nick Desaulniers
On Thu, Sep 12, 2019 at 3:38 PM Joe Perches  wrote:
>
> On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> > On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> > > Please name the major projects and then point to their
> > > .clang-format equivalents.
> > >
> > > Also note the size/scope/complexity of the major projects.
> >
> > Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> > with the official clang-format, not sure if they enforce it.
>
> At least for LLVM, it appears not.

I acknowledge the irony you present, but that's because there's no
enforcement on the LLVM side.  I frequently forget to run:
$ git-clang-format HEAD~

If you have automated systems that help encourage (ie. force) the use
of the formatter, this helps.

Consider the fact that not all kernel developers run checkpatch.pl.
Is that a deficiency in checkpatch.pl, or the lack of enforcement in
kernel developers' workflows?
-- 
Thanks,
~Nick Desaulniers


Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Nick Desaulniers
On Thu, Sep 12, 2019 at 2:58 PM Miguel Ojeda
 wrote:
>
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> >
> > Please name the major projects and then point to their
> > .clang-format equivalents.
> >
> > Also note the size/scope/complexity of the major projects.
>
> Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> with the official clang-format, not sure if they enforce it.
>
> Same for Chromium/Chrome, but it looks like they indeed enforce it:
>
>   "A checkout should give you clang-format to automatically format C++
> code. By policy, Clang's formatting of code should always be accepted
> in code reviews."
>
> I would bet other Google projects do so as well (since Chandler
> Carruth has been giving talks about clang-format for 7+ years). Nick?

So Google3 (the internal monorepo that Android, Chromium, ChromiumOS,
Fuchsia are not a part of) is pretty sweet.  You cannot even post code
unless the linter has been run on it (presubmit hook), which for our
~350 million LoC of C++ is clang-format.  If you bypass local
presubmit hooks, our code review tool ("critique") won't let you
submit code that fails lint presubmit checks.  I suspect the initial
conversion was probably committed by bots.

>
> I hope those are major enough. There is also precedent in other
> languages (e.g. Java, C#, Rust).

Yep! Other people coming to C/C++ from these languages find the
discussion about tabs vs spaces to be highly entertaining!  When you
have an automated code formatter and an agreed upon coding style (and
hopefully enforcement), you save so much time from avoided bikesheds!
Don't like the codebase's coding style?  Then write the code how you
like and just run the formatter when you're done (might not help with
conventions though, maybe that's where checkpatch.pl can shine).
Done! No more wasted time on what color to paint the bikeshed!
-- 
Thanks,
~Nick Desaulniers


Re: [PATCH] watchdog: f71808e_wdt: Add F81803 support

2019-09-12 Thread Jaret Cantu

On 9/12/19 2:50 PM, Guenter Roeck wrote:

On Thu, Sep 12, 2019 at 01:55:50PM -0400, Jaret Cantu wrote:

This adds watchdog support for the Fintek F81803 Super I/O chip.

Testing was done on the Seneca XK-QUAD.

Signed-off-by: Jaret Cantu 


Since there is no datasheet, we can only hope that this works
for other platforms using the same chip. Nothing we can do
about that, so


I did get the register descriptions after hounding the vendor's support 
team for a good long while, which is how I was able to get the watchdog 
working in the first place.  Nothing publicly available, however.


The only real difference between this part and others in the family is 
requiring a bank select before setting the WDTRST pin.  (And the 
registers/bits which have to be twiddled to do so, of course.)




Reviewed-by: Guenter Roeck 


---
  drivers/watchdog/Kconfig   |  4 ++--
  drivers/watchdog/f71808e_wdt.c | 17 -
  2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 8188963a405b..781ff835f2a4 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1046,8 +1046,8 @@ config F71808E_WDT
depends on X86
help
  This is the driver for the hardware watchdog on the Fintek F71808E,
- F71862FG, F71868, F71869, F71882FG, F71889FG, F81865 and F81866
- Super I/O controllers.
+ F71862FG, F71868, F71869, F71882FG, F71889FG, F81803, F81865, and
+ F81866 Super I/O controllers.
  
  	  You can compile this driver directly into the kernel, or use

  it as a module.  The module will be called f71808e_wdt.
diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c
index ff5cf1b48a4d..e46104c2fd94 100644
--- a/drivers/watchdog/f71808e_wdt.c
+++ b/drivers/watchdog/f71808e_wdt.c
@@ -31,8 +31,10 @@
  #define SIO_REG_DEVID 0x20/* Device ID (2 bytes) */
  #define SIO_REG_DEVREV0x22/* Device revision */
  #define SIO_REG_MANID 0x23/* Fintek ID (2 bytes) */
+#define SIO_REG_CLOCK_SEL  0x26/* Clock select */
  #define SIO_REG_ROM_ADDR_SEL  0x27/* ROM address select */
  #define SIO_F81866_REG_PORT_SEL   0x27/* F81866 Multi-Function 
Register */
+#define SIO_REG_TSI_LEVEL_SEL  0x28/* TSI Level select */
  #define SIO_REG_MFUNCT1   0x29/* Multi function select 1 */
  #define SIO_REG_MFUNCT2   0x2a/* Multi function select 2 */
  #define SIO_REG_MFUNCT3   0x2b/* Multi function select 3 */
@@ -49,6 +51,7 @@
  #define SIO_F71869A_ID0x1007  /* Chipset ID */
  #define SIO_F71882_ID 0x0541  /* Chipset ID */
  #define SIO_F71889_ID 0x0723  /* Chipset ID */
+#define SIO_F81803_ID  0x1210  /* Chipset ID */
  #define SIO_F81865_ID 0x0704  /* Chipset ID */
  #define SIO_F81866_ID 0x1010  /* Chipset ID */
  
@@ -108,7 +111,7 @@ MODULE_PARM_DESC(start_withtimeout, "Start watchdog timer on module load with"

" given initial timeout. Zero (default) disables this feature.");
  
  enum chips { f71808fg, f71858fg, f71862fg, f71868, f71869, f71882fg, f71889fg,

-f81865, f81866};
+f81803, f81865, f81866};
  
  static const char *f71808e_names[] = {

"f71808fg",
@@ -118,6 +121,7 @@ static const char *f71808e_names[] = {
"f71869",
"f71882fg",
"f71889fg",
+   "f81803",
"f81865",
"f81866",
  };
@@ -370,6 +374,14 @@ static int watchdog_start(void)
superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf);
break;
  
+	case f81803:

+   /* Enable TSI Level register bank */
+   superio_clear_bit(watchdog.sioaddr, SIO_REG_CLOCK_SEL, 3);
+   /* Set pin 27 to WDTRST# */
+   superio_outb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL, 0x5f &
+   superio_inb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL));
+   break;
+
case f81865:
/* Set pin 70 to WDTRST# */
superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5);
@@ -809,6 +821,9 @@ static int __init f71808e_find(int sioaddr)
/* Confirmed (by datasheet) not to have a watchdog. */
err = -ENODEV;
goto exit;
+   case SIO_F81803_ID:
+   watchdog.type = f81803;
+   break;
case SIO_F81865_ID:
watchdog.type = f81865;
break;
--
2.11.0



Re: [rfc patch script] treewide conversion of __section(foo) to section("foo");

2019-09-12 Thread Nick Desaulniers
On Sun, Sep 8, 2019 at 9:21 PM Joe Perches  wrote:

> So running the script:
>
> $ perl section.pl
>
> produces a commit
> ---
> From 04e52f34fd4ee7008ea5bf0d8896bf8d1fdf9f3f Mon Sep 17 00:00:00 2001
> Message-Id: 
> <04e52f34fd4ee7008ea5bf0d8896bf8d1fdf9f3f.1568001863.git@perches.com>
> From: Joe Perches 
> Date: Sun, 8 Sep 2019 20:53:41 -0700
> Subject: [PATCH] treewide: Convert macro and uses of __section(foo) to
>  __section("foo")
>
> Use a more generic form for __section that requires quotes to avoid
> complications with clang and gcc differences.
>
> Remove the quote operator # from compiler_attributes.h __section macro.
>
> Convert all unquoted __section(foo) uses to quoted __section("foo").
> Also convert __attribute__((section("foo"))) uses to __section("foo")
> even if the __attribute__ has multiple list entry forms.
>
> Signed-off-by: Joe Perches 
> ---
>  arch/arc/include/asm/linkage.h|  8 +++---
>  arch/arc/include/asm/mach_desc.h  |  2 +-
>  arch/arc/plat-hsdk/platform.c |  2 +-
>  arch/arm/include/asm/cache.h  |  2 +-
>  arch/arm/include/asm/cpuidle.h|  2 +-
>  arch/arm/include/asm/idmap.h  |  2 +-
>  arch/arm/include/asm/kvm_hyp.h|  2 +-
>  arch/arm/include/asm/mach/arch.h  |  4 +--
>  arch/arm/include/asm/setup.h  |  2 +-
>  arch/arm/include/asm/smp.h|  2 +-
>  arch/arm/include/asm/tcm.h|  8 +++---
>  arch/arm/kernel/cpuidle.c |  2 +-
>  arch/arm/kernel/devtree.c |  2 +-
>  arch/arm64/include/asm/cache.h|  2 +-
>  arch/arm64/include/asm/exception.h|  2 +-
>  arch/arm64/include/asm/kvm_hyp.h  |  2 +-
>  arch/arm64/kernel/efi.c   |  2 +-
>  arch/arm64/kernel/smp_spin_table.c|  2 +-
>  arch/ia64/include/asm/cache.h |  2 +-
>  arch/microblaze/kernel/setup.c|  2 +-
>  arch/mips/include/asm/cache.h |  2 +-
>  arch/mips/include/asm/mach-pmcs-msp71xx/msp_pci.h |  4 +--
>  arch/mips/include/asm/machine.h   |  2 +-
>  arch/mips/include/asm/mips_machine.h  |  2 +-
>  arch/mips/kernel/setup.c  |  2 +-
>  arch/mips/mm/init.c   |  2 +-
>  arch/parisc/include/asm/cache.h   |  2 +-
>  arch/parisc/include/asm/ldcw.h|  2 +-
>  arch/parisc/kernel/ftrace.c   |  2 +-
>  arch/parisc/mm/init.c |  6 ++--
>  arch/powerpc/boot/main.c  |  2 +-
>  arch/powerpc/boot/ps3.c   |  2 +-
>  arch/powerpc/include/asm/cache.h  |  2 +-
>  arch/powerpc/include/asm/machdep.h|  2 +-
>  arch/powerpc/kernel/btext.c   |  2 +-
>  arch/powerpc/kernel/prom_init.c   |  2 +-
>  arch/powerpc/kvm/book3s_64_vio_hv.c   |  2 +-
>  arch/s390/boot/compressed/decompressor.c  |  2 +-
>  arch/s390/boot/ipl_parm.c |  4 +--
>  arch/s390/boot/startup.c  |  2 +-
>  arch/s390/include/asm/cache.h |  2 +-
>  arch/s390/include/asm/sections.h  |  4 +--
>  arch/s390/kernel/setup.c  |  2 +-
>  arch/s390/mm/init.c   |  2 +-
>  arch/sh/boards/of-generic.c   |  2 +-
>  arch/sh/include/asm/cache.h   |  2 +-
>  arch/sh/include/asm/machvec.h |  2 +-
>  arch/sh/include/asm/smp.h |  2 +-
>  arch/sparc/include/asm/cache.h|  2 +-
>  arch/sparc/kernel/btext.c |  2 +-
>  arch/um/include/shared/init.h | 22 +++
>  arch/um/kernel/skas/clone.c   |  2 +-
>  arch/um/kernel/um_arch.c  |  2 +-
>  arch/x86/boot/compressed/pgtable_64.c |  2 +-
>  arch/x86/boot/tty.c   |  8 +++---
>  arch/x86/boot/video.h |  2 +-
>  arch/x86/include/asm/apic.h   |  4 +--
>  arch/x86/include/asm/cache.h  |  2 +-
>  arch/x86/include/asm/intel-mid.h  |  2 +-
>  arch/x86/include/asm/iommu_table.h|  2 +-
>  arch/x86/include/asm/irqflags.h   |  2 +-
>  arch/x86/include/asm/mem_encrypt.h|  2 +-
>  arch/x86/include/asm/setup.h  |  2 +-
>  arch/x86/kernel/cpu/cpu.h |  2 +-
>  arch/x86/kernel/head64.c  |  2 +-
>  arch/x86/mm/mem_encrypt.c |  4 +--
>  arch/x86/mm/mem_encrypt_identity.c|  2 

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Joe Perches
On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> > Please name the major projects and then point to their
> > .clang-format equivalents.
> > 
> > Also note the size/scope/complexity of the major projects.
> 
> Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> with the official clang-format, not sure if they enforce it.

At least for LLVM, it appears not.

I just tried a very small portion of the clang compiler:

$ git ls-files llvm/lib/CodeGen/ | wc -l
293
$ git ls-files llvm/lib/CodeGen/ | xargs clang-format -i

and got:

$ git diff --shortstat
 245 files changed, 19519 insertions(+), 17794 deletions(-)

btw: that seems a pretty small ~7% of the overall lines

$ git ls-files llvm/lib/CodeGen/ | xargs wc -l | tail -1
 251034 total




Re: [RFC] ARM: dts: omap36xx: Enable thermal throttling

2019-09-12 Thread Daniel Lezcano


Hi Adam,

On 12/09/2019 23:19, Adam Ford wrote:
> On Thu, Sep 12, 2019 at 4:12 PM Daniel Lezcano
>  wrote:
>>
>> On 12/09/2019 20:30, Adam Ford wrote:
>>> The thermal sensor in the omap3 family isn't accurate, but it's
>>> better than nothing.  The various OPP's enabled for the omap3630
>>> support up to OPP1G, however the datasheet for the DM3730 states
>>> that OPP130 and OPP1G are not available above TJ of 90C.
>>>
>>> This patch configures the thermal throttling to limit the
>>> operating points of the omap3630 to Only OPP50 and OPP100 if
>>> the thermal sensor reads a value above 90C.

Oh, that's a very interesting use case.

AFAICT the thermal framework is not designed to deal with this
situation. I agree this setup may work (even if I'm not convinced about
the stability of the whole).

Maybe Viresh can help for the cpufreq side?

>> Out of curiosity, what are the OPP50 and OPP100 mentioned above? And
>> what does "OPP130 and OPP1G are not available above TJ of 90C" mean?
>>
> OPP130 is the 800 MHz and OPP1G is 1GHz operating point.
> The 90C is the max junction temperature.  When the temperature exceeds
> 90C, the processor is not designed to operate at 800+ MHz.  The
> statement itself is a direct quote from the public datasheet for the
> dm3730, Table 4-19.

> The datasheet is: http://www.ti.com/lit/ds/symlink/dm3730.pdf

The way it is stated is ambiguous:

"OPP130 and OPP1G are not available above TJ of 90C"

that could be interpreted as the OPP being disabled by the hardware, no?

> The operating points were updated in [1], but they haven't been fully
> applied yet. During the discussion, the question came up of how to
> limit the speed at high temp, so that's why this patch was done.
> 
> [1] - https://patchwork.kernel.org/patch/11141643/

I see, you switched to opp-v2.

Thanks for the detailed answer.



>> I don't see the connection between these OPP names and the definition in
>> the DT.
>>
>>> Signed-off-by: Adam Ford 
>>>
>>> diff --git a/arch/arm/boot/dts/omap36xx.dtsi 
>>> b/arch/arm/boot/dts/omap36xx.dtsi
>>> index 4bb4f534afe2..58b9d347019f 100644
>>> --- a/arch/arm/boot/dts/omap36xx.dtsi
>>> +++ b/arch/arm/boot/dts/omap36xx.dtsi
>>> @@ -25,6 +25,7 @@
>>>
>>>   vbb-supply = <&abb_mpu_iva>;
>>>   clock-latency = <300000>; /* From omap-cpufreq driver */
>>> + #cooling-cells = <2>;
>>>   };
>>>   };
>>>
>>> @@ -195,6 +196,31 @@
>>>   };
>>>  };
>>>
>>> +&cpu_thermal {
>>> + cpu_trips: trips {
>>> + /* OPP130 and OPP1G are not available above TJ of 90C. */
>>> + cpu_alert0: cpu_alert {
>>> + temperature = <90000>; /* millicelsius */
>>> + hysteresis = <2000>; /* millicelsius */
>>> + type = "passive";
>>> + };
>>> +
>>> + cpu_crit: cpu_crit {
>>> + temperature = <125000>; /* millicelsius */
>>> + hysteresis = <2000>; /* millicelsius */
>>> + type = "critical";
>>> + };
>>> + };
>>> +
>>> + cpu_cooling_maps: cooling-maps {
>>> + map0 {
>>> + trip = <_alert0>;
>>> + /* Only allow OPP50 and OPP100 */
>>> + cooling-device = <&cpu 0 1>;
>>> + };
>>> + };
>>> +};
>>> +
>>>  /* OMAP3630 needs dss_96m_fck for VENC */
>>>   {
>>>   clocks = <_tv_fck>, <_96m_fck>;
>>>


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v3 0/6] make use of gcc 9's "asm inline()"

2019-09-12 Thread Miguel Ojeda
On Fri, Sep 13, 2019 at 12:19 AM Rasmus Villemoes
 wrote:
>
> Patch 1 has already been picked up by Greg in staging-next, it's
> included here for completeness. I don't know how to route the rest, or
> if they should simply wait for 5.5 given how close we are to the merge
> window for 5.4.

If you want I can pick this up in compiler-attributes and submit it as
a whole if we get Acks from the rtl8723bs/x86/... maintainers.

Cheers,
Miguel


[PATCH v3 4/6] compiler-types.h: add asm_inline definition

2019-09-12 Thread Rasmus Villemoes
This adds an asm_inline macro which expands to "asm inline" [1] when
the compiler supports it. This is currently gcc 9.1+, gcc 8.3
and (once released) gcc 7.5 [2]. It expands to just "asm" for other
compilers.

Using asm inline("foo") instead of asm("foo") overrules gcc's
heuristic estimate of the size of the code represented by the asm()
statement, and makes gcc use the minimum possible size instead. That
can in turn affect gcc's inlining decisions.

I wasn't sure whether to make this a function-like macro or not - this
way, it can be combined with volatile as

  asm_inline volatile()

but perhaps we'd prefer to spell that

  asm_inline_volatile()

anyway.

The Kconfig logic is taken from an RFC patch by Masahiro Yamada [3].

[1] Technically, asm __inline, since both inline and __inline__
are macros that attach various attributes, making gcc barf if one
literally does "asm inline()". However, the third spelling __inline is
available for referring to the bare keyword.

[2] https://lore.kernel.org/lkml/20190907001411.gg9...@gate.crashing.org/

[3] 
https://lore.kernel.org/lkml/1544695154-15250-1-git-send-email-yamada.masah...@socionext.com/

Signed-off-by: Rasmus Villemoes 
---
 include/linux/compiler_types.h | 6 ++
 init/Kconfig   | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ee49be6d6088..2bf316fe0a20 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -198,6 +198,12 @@ struct ftrace_likely_data {
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
 
+#ifdef CONFIG_CC_HAS_ASM_INLINE
+#define asm_inline asm __inline
+#else
+#define asm_inline asm
+#endif
+
 #ifndef __no_fgcse
 # define __no_fgcse
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index bd7d650d4a99..7fee5978dd73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -30,6 +30,9 @@ config CC_CAN_LINK
 config CC_HAS_ASM_GOTO
def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
 
+config CC_HAS_ASM_INLINE
+   def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null)
+
 config CC_HAS_WARN_MAYBE_UNINITIALIZED
def_bool $(cc-option,-Wmaybe-uninitialized)
help
-- 
2.20.1



[PATCH v3 3/6] compiler_types.h: don't #define __inline

2019-09-12 Thread Rasmus Villemoes
The spellings __inline and __inline__ should be reserved for uses
where one really wants to refer to the inline keyword, regardless of
whether or not the spelling "inline" has been #defined to something
else. Due to use of __inline__ in uapi headers, we can't easily get
rid of the definition of __inline__. However, almost all users of
__inline have been converted to inline, so we can get rid of that
#define.

The exception is include/acpi/platform/acintel.h. However, that header
is only included when using the intel compiler (does anybody actually
build the kernel with that?), and the ACPI_INLINE macro is only used
in the definition of utterly trivial stub functions, where I doubt a
small change of semantics (lack of __gnu_inline) changes anything.

Signed-off-by: Rasmus Villemoes 
---
 include/linux/compiler_types.h | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 599c27b56c29..ee49be6d6088 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -150,8 +150,17 @@ struct ftrace_likely_data {
__maybe_unused notrace
 #endif
 
+/*
+ * gcc provides both __inline__ and __inline as alternate spellings of
+ * the inline keyword, though the latter is undocumented. New kernel
+ * code should only use the inline spelling, but some existing code
+ * uses __inline__. Since we #define inline above, to ensure
+ * __inline__ has the same semantics, we need this #define.
+ *
+ * However, the spelling __inline is strictly reserved for referring
+ * to the bare keyword.
+ */
 #define __inline__ inline
-#define __inline   inline
 
 /*
  * Rather then using noinline to prevent stack consumption, use
-- 
2.20.1



Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT

2019-09-12 Thread Joel Fernandes
On Thu, Sep 12, 2019 at 05:38:43PM -0400, Joel Fernandes wrote:
> Hi Scott,
> 
> Would you mind CC'ing rcu@vger.kernel.org on RCU related patches? I added it
> for this time.
> 
> On Wed, Sep 11, 2019 at 05:57:28PM +0100, Scott Wood wrote:
> > Besides restoring behavior that used to be default on RT, this avoids
> > a deadlock on scheduler locks:
[snip]
> > [  136.995194] 039:  May be due to missing lock nesting notation
> > 
> > [  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
> > [  137.006341] 039:  #0:
> > [  137.008707] 039: 5f25146d
> > [  137.012024] 039:  (
> > [  137.014131] 039: &p->pi_lock
> > [  137.017015] 039: ){-...}
> > [  137.019558] 039: , at: try_to_wake_up+0x39/0x920
> > [  137.024175] 039:  #1:
> > [  137.026540] 039: 11c8e51d
> > [  137.029859] 039:  (
> > [  137.031966] 039: >lock
> > [  137.034679] 039: ){-...}
> > [  137.037217] 039: , at: try_to_wake_up+0x241/0x920
> > [  137.041924] 039:  #2:
> > [  137.044291] 039: 098649b9
> > [  137.047610] 039:  (
> > [  137.049714] 039: rcu_read_lock
> > [  137.052774] 039: ){}
> > [  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
> > [  137.059934] 039:
> > stack backtrace:
> > [  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded 
> > Tainted: GE 5.2.9-rt3.dbg+ #174
> > [  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS 
> > SE5C620.86B.01.00.0763.022420181017 02/24/2018
> > [  137.084886] 039: Call Trace:
> > [  137.087773] 039:  
> > [  137.090226] 039:  dump_stack+0x5e/0x8b
> > [  137.093997] 039:  __lock_acquire+0x725/0x1100
> > [  137.098358] 039:  lock_acquire+0xc0/0x240
> > [  137.102374] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
> > [  137.111534] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.115910] 039:  try_to_wake_up+0x39/0x920
> > [  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
> > [  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
> > [  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
> > [  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
> > [  137.137880] 039:  update_curr+0x14b/0x420
> > [  137.141894] 039:  enqueue_entity+0x42/0x370
> > [  137.146080] 039:  enqueue_task_fair+0xa9/0x490
> > [  137.150528] 039:  activate_task+0x5a/0xf0
> > [  137.154539] 039:  ttwu_do_activate+0x4e/0x90
> > [  137.158813] 039:  try_to_wake_up+0x277/0x920
> > [  137.163086] 039:  irq_exit+0xb6/0xf0
[snip]
> > Signed-off-by: Scott Wood 
> > ---
> > The prohibition on use_softirq should be able to be dropped once RT gets
> > the latest RCU code, but the question of what use_softirq should default
> > to on PREEMPT_RT remains.
> > 
> > v3: Use IS_ENABLED
> 
> Out of curiosity, does PREEMPT_RT use the NOCB callback offloading? If no,
> should it use it? IIUC, that does make the work the softirq have to do less
> work since the callbacks are executed in threaded context.
> 
> If yes, can RT tolerate use_softirq=false and what could a realistic softirq

s/use_softirq=false/use_softirq=true/

thanks,

 - Joel



[PATCH v3 6/6] x86: bug.h: use asm_inline in _BUG_FLAGS definitions

2019-09-12 Thread Rasmus Villemoes
This helps prevent a BUG* or WARN* in some static inline from
preventing that function (or one of its callers) from being inlined, so
should allow gcc to make better informed inlining decisions.

For example, with gcc 9.2, tcp_fastopen_no_cookie() vanishes from
net/ipv4/tcp_fastopen.o. It does not itself have any BUG or WARN, but
it calls dst_metric() which has a WARN_ON_ONCE - and despite that
WARN_ON_ONCE vanishing since the condition is compile-time false,
dst_metric() is apparently sufficiently "large" that when it gets
inlined into tcp_fastopen_no_cookie(), the latter becomes too large
for inlining.

Overall, if one asks size(1), .text decreases a little and .data
increases by about the same amount (x86-64 defconfig)

$ size vmlinux.{before,after}
   text    data     bss      dec     hex filename
19709726 5202600 1630280 26542606 195020e vmlinux.before
19709330 5203068 1630280 26542678 1950256 vmlinux.after

while bloat-o-meter says

add/remove: 10/28 grow/shrink: 103/51 up/down: 3669/-2854 (815)
...
Total: Before=14783683, After=14784498, chg +0.01%

Signed-off-by: Rasmus Villemoes 
---
 arch/x86/include/asm/bug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 6804d6642767..facba9bc30ca 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -32,7 +32,7 @@
 
 #define _BUG_FLAGS(ins, flags) \
 do {   \
-   asm volatile("1:\t" ins "\n"\
+   asm_inline volatile("1:\t" ins "\n" \
 ".pushsection __bug_table,\"aw\"\n"\
 "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"   \
 "\t"  __BUG_REL(%c0) "\t# bug_entry::file\n"   \
@@ -49,7 +49,7 @@ do {  
\
 
 #define _BUG_FLAGS(ins, flags) \
 do {   \
-   asm volatile("1:\t" ins "\n"\
+   asm_inline volatile("1:\t" ins "\n" \
 ".pushsection __bug_table,\"aw\"\n"\
 "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"   \
 "\t.word %c0""\t# bug_entry::flags\n"  \
-- 
2.20.1



[PATCH v3 2/6] lib/zstd/mem.h: replace __inline by inline

2019-09-12 Thread Rasmus Villemoes
Currently, compiler_types.h #defines __inline as inline (and further
#defines inline to automatically attach some attributes), so this does
not change functionality. It serves as preparation for removing the
#define of __inline.

While at it, also remove the __attribute__((unused)) - it's already
included in the definition of the inline macro, and "open-coded"
__attribute__(()) should be avoided.

Since commit a95b37e20db9 (kbuild: get <linux/compiler_types.h> out of
<linux/kconfig.h>), compiler_types.h is automatically included by all
kernel C code - i.e., the definition of inline including the unused
attribute is guaranteed to be in effect whenever ZSTD_STATIC is
expanded.

Signed-off-by: Rasmus Villemoes 
---
 lib/zstd/mem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h
index 3a0f34c8706c..93d7a2c377fe 100644
--- a/lib/zstd/mem.h
+++ b/lib/zstd/mem.h
@@ -27,7 +27,7 @@
 /*-
 *  Compiler specifics
 **/
-#define ZSTD_STATIC static __inline __attribute__((unused))
+#define ZSTD_STATIC static inline
 
 /*-**
 *  Basic Types
-- 
2.20.1



[PATCH v3 5/6] x86: alternative.h: use asm_inline for all alternative variants

2019-09-12 Thread Rasmus Villemoes
Most, if not all, uses of the alternative* family just provide one or
two instructions in .text, but the string literal can be quite large,
causing gcc to overestimate the size of the generated code. That in
turn affects its decisions about inlining of the function containing
the alternative() asm statement.

New enough versions of gcc allow one to overrule the estimated size by
using "asm inline" instead of just "asm". So replace asm by the helper
asm_inline, which for older gccs just expands to asm.

Signed-off-by: Rasmus Villemoes 
---
 arch/x86/include/asm/alternative.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index 094fbc9c0b1c..13adca37c99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -201,10 +201,10 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)   \
-   asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+   asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
 #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
-   asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+   asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
 
 /*
  * Alternative inline assembly with input.
@@ -218,7 +218,7 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
  * Leaving an unused argument 0 to keep API compatibility.
  */
 #define alternative_input(oldinstr, newinstr, feature, input...)   \
-   asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)  \
+   asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature)   \
: : "i" (0), ## input)
 
 /*
@@ -231,18 +231,18 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
  */
 #define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2,   \
   feature2, input...)   \
-   asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,\
+   asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
newinstr2, feature2) \
: : "i" (0), ## input)
 
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)  \
-   asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)  \
+   asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature)   \
: output : "i" (0), ## input)
 
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)  \
-   asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+   asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
 
 /*
@@ -253,7 +253,7 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
  */
 #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
   output, input...)  \
-   asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
+   asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2)\
: output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
-- 
2.20.1



[PATCH v3 1/6] staging: rtl8723bs: replace __inline by inline

2019-09-12 Thread Rasmus Villemoes
Currently, __inline is #defined as inline in compiler_types.h, so this
should not change functionality. It is preparation for removing said
#define.

While at it, change some "inline static" to the customary "static
inline" order.

Signed-off-by: Rasmus Villemoes 
---
 drivers/staging/rtl8723bs/core/rtw_pwrctrl.c |  4 ++--
 drivers/staging/rtl8723bs/core/rtw_wlan_util.c   |  2 +-
 drivers/staging/rtl8723bs/include/drv_types.h|  6 +++---
 .../staging/rtl8723bs/include/osdep_service.h| 10 +-
 .../rtl8723bs/include/osdep_service_linux.h  | 14 +++---
 drivers/staging/rtl8723bs/include/rtw_mlme.h | 14 +++---
 drivers/staging/rtl8723bs/include/rtw_recv.h | 16 
 drivers/staging/rtl8723bs/include/sta_info.h |  2 +-
 drivers/staging/rtl8723bs/include/wifi.h | 14 +++---
 drivers/staging/rtl8723bs/include/wlan_bssdef.h  |  2 +-
 10 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/drivers/staging/rtl8723bs/core/rtw_pwrctrl.c 
b/drivers/staging/rtl8723bs/core/rtw_pwrctrl.c
index ae7fb7046c93..3750fbaeec4f 100644
--- a/drivers/staging/rtl8723bs/core/rtw_pwrctrl.c
+++ b/drivers/staging/rtl8723bs/core/rtw_pwrctrl.c
@@ -830,12 +830,12 @@ static void pwr_rpwm_timeout_handler(struct timer_list *t)
_set_workitem(>rpwmtimeoutwi);
 }
 
-static __inline void register_task_alive(struct pwrctrl_priv *pwrctrl, u32 tag)
+static inline void register_task_alive(struct pwrctrl_priv *pwrctrl, u32 tag)
 {
pwrctrl->alives |= tag;
 }
 
-static __inline void unregister_task_alive(struct pwrctrl_priv *pwrctrl, u32 tag)
+static inline void unregister_task_alive(struct pwrctrl_priv *pwrctrl, u32 tag)
 {
pwrctrl->alives &= ~tag;
 }
diff --git a/drivers/staging/rtl8723bs/core/rtw_wlan_util.c 
b/drivers/staging/rtl8723bs/core/rtw_wlan_util.c
index 76c50377f0fe..34e1ce1b0689 100644
--- a/drivers/staging/rtl8723bs/core/rtw_wlan_util.c
+++ b/drivers/staging/rtl8723bs/core/rtw_wlan_util.c
@@ -451,7 +451,7 @@ void set_channel_bwmode(struct adapter *padapter, unsigned 
char channel, unsigne
mutex_unlock(&(adapter_to_dvobj(padapter)->setch_mutex));
 }
 
-__inline u8 *get_my_bssid(struct wlan_bssid_ex *pnetwork)
+inline u8 *get_my_bssid(struct wlan_bssid_ex *pnetwork)
 {
return pnetwork->MacAddress;
 }
diff --git a/drivers/staging/rtl8723bs/include/drv_types.h 
b/drivers/staging/rtl8723bs/include/drv_types.h
index 96346ce064aa..d3648f3b1de2 100644
--- a/drivers/staging/rtl8723bs/include/drv_types.h
+++ b/drivers/staging/rtl8723bs/include/drv_types.h
@@ -478,7 +478,7 @@ struct sdio_data intf_data;
 #define dvobj_to_pwrctl(dvobj) (&(dvobj->pwrctl_priv))
 #define pwrctl_to_dvobj(pwrctl) container_of(pwrctl, struct dvobj_priv, 
pwrctl_priv)
 
-__inline static struct device *dvobj_to_dev(struct dvobj_priv *dvobj)
+static inline struct device *dvobj_to_dev(struct dvobj_priv *dvobj)
 {
/* todo: get interface type from dvobj and the return the dev accordingly */
 #ifdef RTW_DVOBJ_CHIP_HW_TYPE
@@ -636,14 +636,14 @@ struct adapter {
 
 /* define RTW_DISABLE_FUNC(padapter, func) (atomic_add(&adapter_to_dvobj(padapter)->disable_func, (func))) */
 /* define RTW_ENABLE_FUNC(padapter, func) (atomic_sub(&adapter_to_dvobj(padapter)->disable_func, (func))) */
-__inline static void RTW_DISABLE_FUNC(struct adapter *padapter, int func_bit)
+static inline void RTW_DISABLE_FUNC(struct adapter *padapter, int func_bit)
 {
int df = atomic_read(&adapter_to_dvobj(padapter)->disable_func);
df |= func_bit;
atomic_set(&adapter_to_dvobj(padapter)->disable_func, df);
 }
 
-__inline static void RTW_ENABLE_FUNC(struct adapter *padapter, int func_bit)
+static inline void RTW_ENABLE_FUNC(struct adapter *padapter, int func_bit)
 {
int df = atomic_read(&adapter_to_dvobj(padapter)->disable_func);
df &= ~(func_bit);
diff --git a/drivers/staging/rtl8723bs/include/osdep_service.h 
b/drivers/staging/rtl8723bs/include/osdep_service.h
index d2616af95ffa..81a9c19ecc6a 100644
--- a/drivers/staging/rtl8723bs/include/osdep_service.h
+++ b/drivers/staging/rtl8723bs/include/osdep_service.h
@@ -110,12 +110,12 @@ int _rtw_netif_rx(_nic_hdl ndev, struct sk_buff *skb);
 
 extern void _rtw_init_queue(struct __queue *pqueue);
 
-static __inline void thread_enter(char *name)
+static inline void thread_enter(char *name)
 {
allow_signal(SIGTERM);
 }
 
-__inline static void flush_signals_thread(void)
+static inline void flush_signals_thread(void)
 {
if (signal_pending (current))
{
@@ -125,7 +125,7 @@ __inline static void flush_signals_thread(void)
 
 #define rtw_warn_on(condition) WARN_ON(condition)
 
-__inline static int rtw_bug_check(void *parg1, void *parg2, void *parg3, void *parg4)
+static inline int rtw_bug_check(void *parg1, void *parg2, void *parg3, void 
*parg4)
 {
int ret = true;
 
@@ -136,7 +136,7 @@ __inline static int rtw_bug_check(void *parg1, void *parg2, 
void *parg3, void *p
 #define _RND(sz, r) 

[PATCH v3 0/6] make use of gcc 9's "asm inline()"

2019-09-12 Thread Rasmus Villemoes
gcc 9+ (and gcc 8.3, 7.5) provides a way to override the otherwise
crude heuristic that gcc uses to estimate the size of the code
represented by an asm() statement. From the gcc docs

  If you use 'asm inline' instead of just 'asm', then for inlining
  purposes the size of the asm is taken as the minimum size, ignoring
  how many instructions GCC thinks it is.

For compatibility with older compilers, we obviously want a

  #if [understands asm inline]
  #define asm_inline asm inline
  #else
  #define asm_inline asm
  #endif

But since we #define the identifier inline to attach some attributes,
we have to use an alternate spelling of that keyword. gcc provides
both __inline__ and __inline, and we currently #define both to inline,
so they all have the same semantics. We have to free up one of
__inline__ and __inline, and the latter is by far the easiest. 

The two x86 changes cause smaller code gen differences than I'd
expect, but I think we do want the asm_inline thing available sooner
or later, so this is just to get the ball rolling.

Changes since v1: __inline instead of __inline__, making the diffstat
400 lines smaller.

Changes since v2: Check support of "asm inline" in Kconfig rather than
based on gcc version, since the feature was backported to gcc 7.x and
gcc 8.x. That also automatically enables it if and when Clang grows
support, though that compiler apparently does not have the same
problems with overestimating sizes of asm()s that gcc has.

Patch 1 has already been picked up by Greg in staging-next, it's
included here for completeness. I don't know how to route the rest, or
if they should simply wait for 5.5 given how close we are to the merge
window for 5.4.

Rasmus Villemoes (6):
  staging: rtl8723bs: replace __inline by inline
  lib/zstd/mem.h: replace __inline by inline
  compiler_types.h: don't #define __inline
  compiler-types.h: add asm_inline definition
  x86: alternative.h: use asm_inline for all alternative variants
  x86: bug.h: use asm_inline in _BUG_FLAGS definitions

 arch/x86/include/asm/alternative.h  | 14 +++---
 arch/x86/include/asm/bug.h  |  4 ++--
 drivers/staging/rtl8723bs/core/rtw_pwrctrl.c|  4 ++--
 drivers/staging/rtl8723bs/core/rtw_wlan_util.c  |  2 +-
 drivers/staging/rtl8723bs/include/drv_types.h   |  6 +++---
 .../staging/rtl8723bs/include/osdep_service.h   | 10 +-
 .../rtl8723bs/include/osdep_service_linux.h | 14 +++---
 drivers/staging/rtl8723bs/include/rtw_mlme.h| 14 +++---
 drivers/staging/rtl8723bs/include/rtw_recv.h| 16 
 drivers/staging/rtl8723bs/include/sta_info.h|  2 +-
 drivers/staging/rtl8723bs/include/wifi.h| 14 +++---
 drivers/staging/rtl8723bs/include/wlan_bssdef.h |  2 +-
 include/linux/compiler_types.h  | 17 -
 init/Kconfig|  3 +++
 lib/zstd/mem.h  |  2 +-
 15 files changed, 71 insertions(+), 53 deletions(-)

-- 
2.20.1



Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT

2019-09-12 Thread Joel Fernandes
On Wed, Sep 11, 2019 at 05:57:29PM +0100, Scott Wood wrote:
> rcutorture was generating some nesting scenarios that are not
> reasonable.  Constrain the state selection to avoid them.
> 
> Example #1:
> 
> 1. preempt_disable()
> 2. local_bh_disable()
> 3. preempt_enable()
> 4. local_bh_enable()
> 
> On PREEMPT_RT, BH disabling takes a local lock only when called in
> non-atomic context.  Thus, atomic context must be retained until after BH
> is re-enabled.  Likewise, if BH is initially disabled in non-atomic
> context, it cannot be re-enabled in atomic context.
> 
> Example #2:
> 
> 1. rcu_read_lock()
> 2. local_irq_disable()
> 3. rcu_read_unlock()
> 4. local_irq_enable()

If I understand correctly, these examples are not unrealistic in the real
world unless RCU is used in the scheduler.

> 
> If the thread is preempted between steps 1 and 2,
> rcu_read_unlock_special.b.blocked will be set, but it won't be
> acted on in step 3 because IRQs are disabled.  Thus, reporting of the
> quiescent state will be delayed beyond the local_irq_enable().

Yes, with consolidated RCU this can happen, but AFAIK it has not been seen to
be a problem since deferred QS reporting will take care of it, which can also
happen from a subsequent rcu_read_unlock_special().

> For now, these scenarios will continue to be tested on non-PREEMPT_RT
> kernels, until debug checks are added to ensure that they are not
> happening elsewhere.

Are you seeing real issues that need this patch? It would be good to not
complicate rcutorture if not needed.

thanks,

 - Joel


> 
> Signed-off-by: Scott Wood 
> ---
> v3: Limit to RT kernels, and remove one constraint that, while it
> is bad on both RT and non-RT (missing a schedule), does not oops or
> otherwise prevent using rcutorture.  It would be added once debug checks
> are implemented.
> 
>  kernel/rcu/rcutorture.c | 96 
> +
>  1 file changed, 82 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index efaa5b3f4d3f..ecb82cc432af 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -60,10 +60,13 @@
>  #define RCUTORTURE_RDR_RBH0x08   /*  ... rcu_read_lock_bh(). */
>  #define RCUTORTURE_RDR_SCHED  0x10   /*  ... rcu_read_lock_sched(). */
>  #define RCUTORTURE_RDR_RCU0x20   /*  ... entering another RCU reader. */
> -#define RCUTORTURE_RDR_NBITS  6  /* Number of bits defined above. */
> +#define RCUTORTURE_RDR_ATOM_BH   0x40   /*  ... disabling bh while atomic */
> +#define RCUTORTURE_RDR_ATOM_RBH   0x80   /*  ... RBH while atomic */
> +#define RCUTORTURE_RDR_NBITS  8  /* Number of bits defined above. */
>  #define RCUTORTURE_MAX_EXTEND \
>   (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
> -  RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
> +  RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED | \
> +  RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH)
>  #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
>   /* Must be power of two minus one. */
>  #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
> @@ -1092,31 +1095,52 @@ static void rcutorture_one_extend(int *readstate, int 
> newstate,
>   WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
>   rtrsp->rt_readstate = newstate;
>  
> - /* First, put new protection in place to avoid critical-section gap. */
> + /*
> +  * First, put new protection in place to avoid critical-section gap.
> +  * Disable preemption around the ATOM disables to ensure that
> +  * in_atomic() is true.
> +  */
>   if (statesnew & RCUTORTURE_RDR_BH)
>   local_bh_disable();
> + if (statesnew & RCUTORTURE_RDR_RBH)
> + rcu_read_lock_bh();
>   if (statesnew & RCUTORTURE_RDR_IRQ)
>   local_irq_disable();
>   if (statesnew & RCUTORTURE_RDR_PREEMPT)
>   preempt_disable();
> - if (statesnew & RCUTORTURE_RDR_RBH)
> - rcu_read_lock_bh();
>   if (statesnew & RCUTORTURE_RDR_SCHED)
>   rcu_read_lock_sched();
> + preempt_disable();
> + if (statesnew & RCUTORTURE_RDR_ATOM_BH)
> + local_bh_disable();
> + if (statesnew & RCUTORTURE_RDR_ATOM_RBH)
> + rcu_read_lock_bh();
> + preempt_enable();
>   if (statesnew & RCUTORTURE_RDR_RCU)
>   idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
>  
> - /* Next, remove old protection, irq first due to bh conflict. */
> + /*
> +  * Next, remove old protection, in decreasing order of strength
> +  * to avoid unlock paths that aren't safe in the stronger
> +  * context.  Disable preemption around the ATOM enables in
> +  * case the context was only atomic due to IRQ disabling.
> +  */
> + preempt_disable();
>   if (statesold & RCUTORTURE_RDR_IRQ)
>   local_irq_enable();
> -

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Joe Perches
On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> > Please name the major projects and then point to their
> > .clang-format equivalents.
> > 
> > Also note the size/scope/complexity of the major projects.
> 
> Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> with the official clang-format, not sure if they enforce it.
> 
> Same for Chromium/Chrome, but it looks like they indeed enforce it:

thanks for that list.

> > I used the latest one, and quite a bit of the conversion
> > was unpleasant to read.
> 
> It would be good to see particularly bad snippets to see if we can do
> something about them (and, if needed, try to improve clang-format to
> support whatever we need).

As I mentioned earlier, look at the __stringify conversion.
Also the C() blocks.

btw: emacs 'mark-whole-buffer indent-region',
the tool I used for each file in patch 1, also
made a mess of the C() block.

> Did you tweak the parameters with the new ones?

No.  I used

$ clang-format --version
clang-format version 10.0.0 (git://github.com/llvm/llvm-project.git 305b961f64b75e73110e309341535f6d5a48ed72)

and the existing .clang-format from
next-20190904 35394d031b710e832849fca60d0f53b513f0c390

> I am preparing an RFC
> patch for an updated .clang-format configuration that improves quite a
> bit the results w.r.t. to the current one (and allows for some leeway
> on the developer's side, which helps prevent some cases too).

Well, one day no doubt an automated tool will be
more useful for the kernel.  Hope you keep at it
and good luck.

> > Marking sections _no_auto_format_ isn't really a
> > good solution is it?
> 
> I am thinking about special tables that are hand-crafted or very
> complex macros. For those, yes, I think it is a fine solution. That is
> why clang-format has that feature to begin with, and you can see an
> example in Mozilla's style guide which points here:
> 
>   https://github.com/mozilla/gecko-dev/blob/master/xpcom/io/nsEscape.cpp#L22
> 
> Cheers,
> Miguel



Re: [PATCH] sched: fix migration to invalid cpu in __set_cpus_allowed_ptr

2019-09-12 Thread Valentin Schneider
On 12/09/2019 02:55, shikemeng wrote:
> From 089dbf0216628ac6ae98742ab90725ca9c2bf201 Mon Sep 17 00:00:00 2001
> From:  
> Date: Tue, 10 Sep 2019 09:44:58 -0400
> Subject: [PATCH] sched: fix migration to invalid cpu in __set_cpus_allowed_ptr
> 
> reason: migration to invalid cpu in __set_cpus_allowed_ptr
> archive path: patches/euleros/sched
> 

The above should probably be trimmed from the log.

> Oops occur when running qemu on arm64:
>  Unable to handle kernel paging request at virtual address 08effe40
>  Internal error: Oops: 9607 [#1] SMP
>  Process migration/0 (pid: 12, stack limit = 0x084e3736)
>  pstate: 2085 (nzCv daIf -PAN -UAO)
>  pc : __ll_sc___cmpxchg_case_acq_4+0x4/0x20
>  lr : move_queued_task.isra.21+0x124/0x298
>  ...
>  Call trace:
>   __ll_sc___cmpxchg_case_acq_4+0x4/0x20
>   __migrate_task+0xc8/0xe0
>   migration_cpu_stop+0x170/0x180
>   cpu_stopper_thread+0xec/0x178
>   smpboot_thread_fn+0x1ac/0x1e8
>   kthread+0x134/0x138
>   ret_from_fork+0x10/0x18
> 
> __set_cpus_allowed_ptr will choose an active dest_cpu in the affinity mask to
> migrate the process if the process is not currently running on any one of the
> CPUs specified in the affinity mask. __set_cpus_allowed_ptr will choose an
> invalid dest_cpu (>= nr_cpu_ids, 1024 in my virtual machine) if the CPUs in
> the affinity mask are deactivated by cpu_down after the cpumask_intersects
> check.

Right, cpumask_any_and() returns >= nr_cpu_ids when there isn't any valid CPU
bit set.

> The subsequent cpumask_test_cpu() of dest_cpu overflows and may pass if the
> corresponding bit is coincidentally set.

Ouch. I was going to say the cpu_active_mask check from is_cpu_allowed()
should've stopped the whole thing there, but AFAICT there's no safeguard
against > nr_cpu_ids bit accesses. I see CONFIG_DEBUG_PER_CPU_MAPS should
fire a warning for such accesses, but we don't enable it by default.

Would it make sense to do something like

return test_bit(...) && cpu < nr_cpu_ids;

for cpumask_test_cpu()? We still get the warn with the right config, but we
prevent sneaky mistakes like this one. And it seems it's not the only one
according to:

--
virtual patch
virtual report

@nocheck@
expression E;
identifier any_func =~ "^cpumask_any";
position pos;
@@

E@pos = any_func(...);
... when != E >= nr_cpu_ids
    when != E < nr_cpu_ids

@script:python depends on nocheck && report@
p << nocheck.pos;
@@
coccilib.report.print_report(p[0], "Missing cpumask_any_*() return value check!")
---

Some of those seem benign since they are on e.g. cpu_online_mask, some other
are somewhat dubious (e.g. deadline.c::dl_task_can_attach()).

> As a consequence, the kernel will access an invalid rq address associated
> with the invalid cpu in migration_cpu_stop->__migrate_task->move_queued_task
> and the Oops occurs. The following sequence may trigger the Oops:
> 1) A process repeatedly binds itself to cpu0 and cpu1 in turn by calling
> sched_setaffinity
> 2) A shell script repeatedly does "echo 0 > /sys/devices/system/cpu/cpu1/online"
> and "echo 1 > /sys/devices/system/cpu/cpu1/online" in turn
> 3) The Oops appears if the invalid cpu is set in memory after the cpumask test.
> 
> 
> Change-Id: I9c2f95aecd3da568991b7408397215f26c990e40

- This doesn't respect the 75 char per line limit
- Change IDs don't belong here (we're not using Gerrit)
- You're missing a Signed-off-by.

You'll find all the guidelines you need for formatting patches in
Documentation/process/submitting-patches.rst.

> ---
>  kernel/sched/core.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4b63fef..5181ea9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1112,7 +1112,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> if (cpumask_equal(&p->cpus_allowed, new_mask))
> goto out;
> 
> -   if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
> +   dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> +   if (dest_cpu >= nr_cpu_ids) {
> ret = -EINVAL;
> goto out;
> }
> @@ -1133,7 +1134,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> if (cpumask_test_cpu(task_cpu(p), new_mask))
> goto out;
> 
> -   dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> if (task_running(rq, p) || p->state == TASK_WAKING) {
> struct migration_arg arg = { p, dest_cpu };
> /* Need help from migration thread: drop lock and wait. */
> --
> 1.8.5.6
> 


Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs

2019-09-12 Thread Joel Fernandes
On Wed, Sep 11, 2019 at 05:57:25PM +0100, Scott Wood wrote:
> A plain local_bh_disable() is documented as creating an RCU critical
> section, and (at least) rcutorture expects this to be the case.  However,
> in_softirq() doesn't block a grace period on PREEMPT_RT, since RCU checks
> preempt_count() directly.  Even if RCU were changed to check
> in_softirq(), that wouldn't allow blocked BH disablers to be boosted.
> 
> Fix this by calling rcu_read_lock() from local_bh_disable(), and update
> rcu_read_lock_bh_held() accordingly.
> 
> Signed-off-by: Scott Wood 

Reviewed-by: Joel Fernandes (Google) 

thanks,

 - Joel

> ---
> v3: Remove change to rcu_read_lock_bh_held(), and move debug portions
> of rcu_read_[un]lock_bh() to separate functions
> ---
>  include/linux/rcupdate.h | 40 
>  kernel/softirq.c | 12 +---
>  2 files changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 388ace315f32..9ce7c5006a5e 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -600,6 +600,36 @@ static inline void rcu_read_unlock(void)
>   rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
>  }
>  
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +/*
> + * On RT, local_bh_disable() calls rcu_read_lock() -- no need to
> + * track it twice.
> + */
> +static inline void rcu_bh_lock_acquire(void)
> +{
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> +}
> +#else
> +static inline void rcu_bh_lock_acquire(void)
> +{
> + __acquire(RCU_BH);
> + rcu_lock_acquire(&rcu_bh_lock_map);
> + RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +  "rcu_read_lock_bh() used illegally while idle");
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> + RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +  "rcu_read_unlock_bh() used illegally while idle");
> + rcu_lock_release(&rcu_bh_lock_map);
> + __release(RCU_BH);
> +}
> +#endif
> +
>  /**
>   * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
>   *
> @@ -615,10 +645,7 @@ static inline void rcu_read_unlock(void)
>  static inline void rcu_read_lock_bh(void)
>  {
>   local_bh_disable();
> - __acquire(RCU_BH);
> - rcu_lock_acquire(&rcu_bh_lock_map);
> - RCU_LOCKDEP_WARN(!rcu_is_watching(),
> -  "rcu_read_lock_bh() used illegally while idle");
> + rcu_bh_lock_acquire();
>  }
>  
>  /*
> @@ -628,10 +655,7 @@ static inline void rcu_read_lock_bh(void)
>   */
>  static inline void rcu_read_unlock_bh(void)
>  {
> - RCU_LOCKDEP_WARN(!rcu_is_watching(),
> -  "rcu_read_unlock_bh() used illegally while idle");
> - rcu_lock_release(&rcu_bh_lock_map);
> - __release(RCU_BH);
> + rcu_bh_lock_release();
>   local_bh_enable();
>  }
>  
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index d16d080a74f7..6080c9328df1 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -115,8 +115,10 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
>   long soft_cnt;
>  
>   WARN_ON_ONCE(in_irq());
> - if (!in_atomic())
> + if (!in_atomic()) {
>   local_lock(bh_lock);
> + rcu_read_lock();
> + }
>   soft_cnt = this_cpu_inc_return(softirq_counter);
>   WARN_ON_ONCE(soft_cnt == 0);
>   current->softirq_count += SOFTIRQ_DISABLE_OFFSET;
> @@ -151,8 +153,10 @@ void _local_bh_enable(void)
>  #endif
>  
>   current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
> - if (!in_atomic())
> + if (!in_atomic()) {
> + rcu_read_unlock();
>   local_unlock(bh_lock);
> + }
>  }
>  
>  void _local_bh_enable_rt(void)
> @@ -185,8 +189,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
>   WARN_ON_ONCE(count < 0);
>   local_irq_enable();
>  
> - if (!in_atomic())
> + if (!in_atomic()) {
> + rcu_read_unlock();
>   local_unlock(bh_lock);
> + }
>  
>   current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
>   preempt_check_resched();
> -- 
> 1.8.3.1
> 


Re: [PATCH v2 1/4] mm: correct mask size for slub page->objects

2019-09-12 Thread Kirill A. Shutemov
On Thu, Sep 12, 2019 at 03:11:14PM -0600, Yu Zhao wrote:
> On Thu, Sep 12, 2019 at 12:40:35PM +0300, Kirill A. Shutemov wrote:
> > On Wed, Sep 11, 2019 at 08:31:08PM -0600, Yu Zhao wrote:
> > > Mask of slub objects per page shouldn't be larger than what
> > > page->objects can hold.
> > > 
> > > It requires more than 2^15 objects to hit the problem, and I don't
> > > think anybody would. It'd be nice to have the mask fixed, but not
> > > really worth cc'ing the stable.
> > > 
> > > Fixes: 50d5c41cd151 ("slub: Do not use frozen page flag but a bit in the 
> > > page counters")
> > > Signed-off-by: Yu Zhao 
> > 
> > I don't think the patch fixes anything.
> 
> Technically it does. It makes no sense for a mask to have more bits
> than the variable that holds the masked value. I had to look up the
> commit history to find out why and go through the code to make sure
> it doesn't actually cause any problem.
> 
> My hope is that nobody else would have to go through the same trouble.

Just put some comments then.

-- 
 Kirill A. Shutemov


Re: [PATCH] bpf: validate bpf_func when BPF_JIT is enabled

2019-09-12 Thread Sami Tolvanen
On Thu, Sep 12, 2019 at 3:52 AM Toke Høiland-Jørgensen  wrote:
> I think it would be good if you do both. I'm a bit worried that XDP
> performance will end up in a "death by a thousand paper cuts" situation,
> so I'd rather push back on even relatively small overheads like this; so
> being able to turn it off in the config would be good.

OK, thanks for the feedback. In that case, I think it's probably
better to wait until we have CFI ready for upstreaming and use the
same config for this one.

> Can you share more details about what the "future CFI checking" is
> likely to look like?

Sure, I posted an overview of CFI and what we're doing in Pixel devices here:

https://android-developers.googleblog.com/2018/10/control-flow-integrity-in-android-kernel.html

Sami


Re: Documentation for plain accesses and data races

2019-09-12 Thread Paul E. McKenney
On Fri, Sep 06, 2019 at 02:11:29PM -0400, Alan Stern wrote:
> Folks:
> 
> I have spent some time writing up a section for 
> tools/memory-model/Documentation/explanation.txt on plain accesses and 
> data races.  The initial version is below.
> 
> I'm afraid it's rather long and perhaps gets too bogged down in 
> complexities.  On the other hand, this is a complicated topic so to 
> some extent this is unavoidable.
> 
> In any case, I'd like to hear your comments and reviews.

Good stuff, thank you for putting this together!

Please see below for some questions, comments, and confusion interspersed.

> Alan
> 
> 
> 
> 
> PLAIN ACCESSES AND DATA RACES
> -
> 
> In the LKMM, memory accesses such as READ_ONCE(x), atomic_inc(),
> smp_load_acquire(), and so on are collectively referred to as
> "marked" accesses, because they are all annotated with special
> operations of one kind or another.  Ordinary C-language memory
> accesses such as x or y = 0 are simply called "plain" accesses.
> 
> Early versions of the LKMM had nothing to say about plain accesses.
> The C standard allows compilers to assume that the variables affected
> by plain accesses are not concurrently read or written by any other
> threads or CPUs.  This leaves compilers free to implement all manner
> of transformations or optimizations of code containing plain accesses,
> making such code very difficult for a memory model to handle.
> 
> Here is just one example of a possible pitfall:
> 
>   int a = 6;
>   int *x = &a;
> 
>   P0()
>   {
>   int *r1;
>   int r2 = 0;
> 
>   r1 = x;
>   if (r1 != NULL)
>   r2 = READ_ONCE(*r1);
>   }
> 
>   P1()
>   {
>   WRITE_ONCE(x, NULL);
>   }

I tried making a litmus test out of this:


C plain-1

{
int a = 6;
int *x = &a;
}

P0(int **x)
{
int *r1;
int r2 = 0;

r1 = *x;
if (r1 != 0)
r2 = READ_ONCE(*r1);
}

P1(int **x)
{
WRITE_ONCE(*x, 0);
}

locations [a; x; r1]
exists ~r2=6 /\ ~r2=0


However, r1 steadfastly refuses to have any value other than zero.


$ herd7 -conf linux-kernel.cfg /tmp/argh
Test plain-1 Allowed
States 1
a=6; r1=0; r2=0; x=0;
No
Witnesses
Positive: 0 Negative: 2
Flag data-race
Condition exists (not (r2=6) /\ not (r2=0))
Observation plain-1 Never 0 2
Time plain-1 0.00
Hash=b0fdbd0f627fd65e0cd413bf87f6f4a4


What am I doing wrong here?  Outdated herd7 version?

$ herd7 -version
7.52+7(dev), Rev: c81f1ff06f30d5c28c34d893a29f5f8505334179

Hmmm...  I might well be in an inconsistent herd7/ocaml state.  If no
one sees anything obvious, I will try reinstalling from scratch, but
that will not likely happen in the next few days.

> On the face of it, one would expect that when this code runs, the only
> possible final values for r2 are 6 and 0, depending on whether or not
> P1's store to x propagates to P0 before P0's load from x executes.
> But since P0's load from x is a plain access, the compiler may decide
> to carry out the load twice (for the comparison against NULL, then again
> for the READ_ONCE()) and eliminate the temporary variable r1.  The
> object code generated for P0 could therefore end up looking rather
> like this:
> 
>   P0()
>   {
>   int r2 = 0;
> 
>   if (x != NULL)
>   r2 = READ_ONCE(*x);
>   }
> 
> And now it is obvious that this code runs the risk of dereferencing a
> NULL pointer, because P1's store to x might propagate to P0 after the
> test against NULL has been made but before the READ_ONCE() executes.
> If the original code had said "r1 = READ_ONCE(x)" instead of "r1 = x",
> the compiler would not have performed this optimization and there
> would be no possibility of a NULL-pointer dereference.
> 
> Given the possibility of transformations like this one, the LKMM
> doesn't try to predict all possible outcomes of code containing plain
> accesses.  It is content to determine whether the code violates the

I suggest starting this sentence with something like "It is instead
content to determine", adding "instead", to help the reader transition.

> compiler's assumptions, which would render the ultimate outcome
> undefined.
> 
> In technical terms, the compiler is allowed to assume that when the
> program executes, there will not be any data races.  A "data race"
> occurs when two conflicting memory accesses execute concurrently;
> two memory accesses "conflict" if:
> 
>   they access the same location,
> 
>   they occur on different CPUs (or in different 

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Miguel Ojeda
On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
>
> Please name the major projects and then point to their
> .clang-format equivalents.
>
> Also note the size/scope/complexity of the major projects.

Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
with the official clang-format, not sure if they enforce it.

Same for Chromium/Chrome, but it looks like they indeed enforce it:

  "A checkout should give you clang-format to automatically format C++
code. By policy, Clang's formatting of code should always be accepted
in code reviews."

I would bet other Google projects do so as well (since Chandler
Carruth has been giving talks about clang-format for 7+ years). Nick?

I hope those are major enough. There is also precedent in other
languages (e.g. Java, C#, Rust).

> I used the latest one, and quite a bit of the conversion
> was unpleasant to read.

It would be good to see particularly bad snippets to see if we can do
something about them (and, if needed, try to improve clang-format to
support whatever we need).

Did you tweak the parameters with the new ones? I am preparing an RFC
patch for an updated .clang-format configuration that improves quite a
bit the results w.r.t. to the current one (and allows for some leeway
on the developer's side, which helps prevent some cases too).

> Marking sections _no_auto_format_ isn't really a
> good solution is it?

I am thinking about special tables that are hand-crafted or very
complex macros. For those, yes, I think it is a fine solution. That is
why clang-format has that feature to begin with, and you can see an
example in Mozilla's style guide which points here:

  https://github.com/mozilla/gecko-dev/blob/master/xpcom/io/nsEscape.cpp#L22

Cheers,
Miguel


[PATCH v2] dt-bindings: iio: accel: add binding documentation for ADIS16240

2019-09-12 Thread Rodrigo Carvalho
This patch add device tree binding documentation for ADIS16240.

Signed-off-by: Rodrigo Ribeiro Carvalho 
---
V2:
  - Remove true constant for spi-cpha and spi-cpol
  - Add description field for spi-cpha and spi-cpol
  - Add maxItems field for spi-cpha and spi-cpol

 .../bindings/iio/accel/adi,adis16240.yaml | 61 +++
 1 file changed, 61 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/iio/accel/adi,adis16240.yaml

diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adis16240.yaml 
b/Documentation/devicetree/bindings/iio/accel/adi,adis16240.yaml
new file mode 100644
index ..4b1bd2419604
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adis16240.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/accel/adi,adis16240.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ADIS16240 Programmable Impact Sensor and Recorder driver
+
+maintainers:
+  - Alexandru Ardelean 
+
+description: |
+  ADIS16240 Programmable Impact Sensor and Recorder driver that supports
+  SPI interface.
+  https://www.analog.com/en/products/adis16240.html
+
+properties:
+  compatible:
+    enum:
+      - adi,adis16240
+
+  reg:
+    maxItems: 1
+
+  spi-cpha:
+    description: |
+      See Documentation/devicetree/bindings/spi/spi-controller.yaml
+    maxItems: 1
+
+  spi-cpol:
+    description: |
+      See Documentation/devicetree/bindings/spi/spi-controller.yaml
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include 
+    #include 
+    spi0 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        /* Example for a SPI device node */
+        accelerometer@0 {
+            compatible = "adi,adis16240";
+            reg = <0>;
+            spi-max-frequency = <250>;
+            spi-cpol;
+            spi-cpha;
+            interrupt-parent = <>;
+            interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+        };
+    };
-- 
2.23.0.rc1



problem starting /sbin/init (32-bit 5.3-rc8)

2019-09-12 Thread Randy Dunlap
This is 32-bit kernel, just happens to be running on a 64-bit laptop.
I added the debug printk in __phys_addr() just before "[cut here]".

CONFIG_HARDENED_USERCOPY=y

The BUG is this line in arch/x86/mm/physaddr.c:
VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
It's line 83 in my source file only due to adding  and
a conditional pr_crit() call.


[   19.730409][T1] debug: unmapping init [mem 0xdc7bc000-0xdca30fff]
[   19.734289][T1] Write protecting kernel text and read-only data: 13888k
[   19.737675][T1] rodata_test: all tests were successful
[   19.740757][T1] Run /sbin/init as init process
[   19.792877][T1] __phys_addr: max_low_pfn=0x36ffe, x=0xff001ff1, 
phys_addr=0x3f001ff1
[   19.796561][T1] [ cut here ]
[   19.797501][T1] kernel BUG at ../arch/x86/mm/physaddr.c:83!
[   19.802799][T1] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[   19.803782][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 #6
[   19.803782][T1] Hardware name: Dell Inc. Inspiron 1318   
/0C236D, BIOS A04 01/15/2009
[   19.803782][T1] EIP: __phys_addr+0xaf/0x100
[   19.803782][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 50 68 
90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 76 11 <0f> 
0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
[   19.803782][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: db90a471
[   19.803782][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: f41dde90
[   19.803782][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
00010216
[   19.803782][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 000406d0
[   19.803782][T1] Call Trace:
[   19.803782][T1]  __check_object_size+0xaf/0x3c0
[   19.803782][T1]  ? __might_sleep+0x80/0xa0
[   19.803782][T1]  copy_strings+0x1c2/0x370
[   19.803782][T1]  copy_strings_kernel+0x2b/0x40
[   19.803782][T1]  __do_execve_file+0x4ca/0x810
[   19.803782][T1]  ? kmem_cache_alloc+0x1c7/0x370
[   19.803782][T1]  do_execve+0x1b/0x20
[   19.803782][T1]  run_init_process+0x31/0x40
[   19.803782][T1]  try_to_run_init_process+0x11/0x40
[   19.803782][T1]  kernel_init+0xda/0x120
[   19.803782][T1]  ? rest_init+0x130/0x130
[   19.803782][T1]  ret_from_fork+0x2e/0x38
[   19.803782][T1] Modules linked in:
[   19.876679][T1] ---[ end trace 2b8071cbe5f1eece ]---
[   19.879467][T1] EIP: __phys_addr+0xaf/0x100
[   19.882125][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 50 68 
90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 76 11 <0f> 
0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
[   19.889474][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: db90a471
[   19.892635][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: f41dde90
[   19.895806][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
00010216
[   19.899106][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 000406d0
[   19.902276][T1] Kernel panic - not syncing: Fatal exception
[   19.903268][T1] Kernel Offset: 0x1a80 from 0xc100 (relocation 
range: 0xc000-0xf77fdfff)
[   19.903268][T1] ---[ end Kernel panic - not syncing: Fatal exception ]---


Full boot log or kernel .config file are available if wanted.

-- 
~Randy


[PATCH] irqchip/sifive-plic: add irq_mask and irq_unmask

2019-09-12 Thread Darius Rad
As per the existing comment, irq_mask and irq_unmask do not need
to do anything for the PLIC.  However, the functions must exist
(the pointers cannot be NULL) as they are not optional, based on
the documentation (Documentation/core-api/genericirq.rst) as well
as existing usage (e.g., include/linux/irqchip/chained_irq.h).

Signed-off-by: Darius Rad 
---
 drivers/irqchip/irq-sifive-plic.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
index cf755964f2f8..52d5169f924f 100644
--- a/drivers/irqchip/irq-sifive-plic.c
+++ b/drivers/irqchip/irq-sifive-plic.c
@@ -111,6 +111,13 @@ static void plic_irq_disable(struct irq_data *d)
plic_irq_toggle(cpu_possible_mask, d->hwirq, 0);
 }
 
+/*
+ * There is no need to mask/unmask PLIC interrupts.  They are "masked"
+ * by reading claim and "unmasked" when writing it back.
+ */
+static void plic_irq_mask(struct irq_data *d) { }
+static void plic_irq_unmask(struct irq_data *d) { }
+
 #ifdef CONFIG_SMP
 static int plic_set_affinity(struct irq_data *d,
 const struct cpumask *mask_val, bool force)
@@ -138,12 +145,10 @@ static int plic_set_affinity(struct irq_data *d,
 
 static struct irq_chip plic_chip = {
.name   = "SiFive PLIC",
-   /*
-* There is no need to mask/unmask PLIC interrupts.  They are "masked"
-* by reading claim and "unmasked" when writing it back.
-*/
.irq_enable = plic_irq_enable,
.irq_disable= plic_irq_disable,
+   .irq_mask   = plic_irq_mask,
+   .irq_unmask = plic_irq_unmask,
 #ifdef CONFIG_SMP
.irq_set_affinity = plic_set_affinity,
 #endif
-- 
2.20.1



Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT

2019-09-12 Thread Joel Fernandes
Hi Scott,

Would you mind CC'ing r...@vger.kernel.org on RCU related patches? I added it
for this time.

On Wed, Sep 11, 2019 at 05:57:28PM +0100, Scott Wood wrote:
> Besides restoring behavior that used to be default on RT, this avoids
> a deadlock on scheduler locks:
> 
> [  136.894657] 039: 
> [  136.900401] 039: WARNING: possible recursive locking detected
> [  136.906146] 039: 5.2.9-rt3.dbg+ #174 Tainted: GE
> [  136.912152] 039: 
> [  136.917896] 039: rcu_torture_rea/13474 is trying to acquire lock:
> [  136.923990] 039: 5f25146d
> [  136.927310] 039:  (
> [  136.929414] 039: >pi_lock
> [  136.932303] 039: ){-...}
> [  136.934840] 039: , at: try_to_wake_up+0x39/0x920
> [  136.939461] 039:
> but task is already holding lock:
> [  136.944425] 039: 5f25146d
> [  136.947745] 039:  (
> [  136.949852] 039: >pi_lock
> [  136.952738] 039: ){-...}
> [  136.955274] 039: , at: try_to_wake_up+0x39/0x920
> [  136.959895] 039:
> other info that might help us debug this:
> [  136.96] 039:  Possible unsafe locking scenario:
> 
> [  136.970608] 039:CPU0
> [  136.973493] 039:
> [  136.976378] 039:   lock(
> [  136.978918] 039: >pi_lock
> [  136.981806] 039: );
> [  136.983911] 039:   lock(
> [  136.986451] 039: >pi_lock
> [  136.989336] 039: );
> [  136.991444] 039:
>  *** DEADLOCK ***
> 
> [  136.995194] 039:  May be due to missing lock nesting notation
> 
> [  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
> [  137.006341] 039:  #0:
> [  137.008707] 039: 5f25146d
> [  137.012024] 039:  (
> [  137.014131] 039: >pi_lock
> [  137.017015] 039: ){-...}
> [  137.019558] 039: , at: try_to_wake_up+0x39/0x920
> [  137.024175] 039:  #1:
> [  137.026540] 039: 11c8e51d
> [  137.029859] 039:  (
> [  137.031966] 039: >lock
> [  137.034679] 039: ){-...}
> [  137.037217] 039: , at: try_to_wake_up+0x241/0x920
> [  137.041924] 039:  #2:
> [  137.044291] 039: 098649b9
> [  137.047610] 039:  (
> [  137.049714] 039: rcu_read_lock
> [  137.052774] 039: ){}
> [  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
> [  137.059934] 039:
> stack backtrace:
> [  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded 
> Tainted: GE 5.2.9-rt3.dbg+ #174
> [  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS 
> SE5C620.86B.01.00.0763.022420181017 02/24/2018
> [  137.084886] 039: Call Trace:
> [  137.087773] 039:  
> [  137.090226] 039:  dump_stack+0x5e/0x8b
> [  137.093997] 039:  __lock_acquire+0x725/0x1100
> [  137.098358] 039:  lock_acquire+0xc0/0x240
> [  137.102374] 039:  ? try_to_wake_up+0x39/0x920
> [  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
> [  137.111534] 039:  ? try_to_wake_up+0x39/0x920
> [  137.115910] 039:  try_to_wake_up+0x39/0x920
> [  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
> [  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
> [  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
> [  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
> [  137.137880] 039:  update_curr+0x14b/0x420
> [  137.141894] 039:  enqueue_entity+0x42/0x370
> [  137.146080] 039:  enqueue_task_fair+0xa9/0x490
> [  137.150528] 039:  activate_task+0x5a/0xf0
> [  137.154539] 039:  ttwu_do_activate+0x4e/0x90
> [  137.158813] 039:  try_to_wake_up+0x277/0x920
> [  137.163086] 039:  irq_exit+0xb6/0xf0
> [  137.11] 039:  smp_apic_timer_interrupt+0xe3/0x3a0
> [  137.171714] 039:  apic_timer_interrupt+0xf/0x20
> [  137.176249] 039:  
> [  137.178785] 039: RIP: 0010:__schedule+0x0/0x8e0
> [  137.183319] 039: Code: 00 02 48 89 43 20 e8 0f 5a 00 00 48 8d 7b 28 e8 86 
> f2 fd ff 31 c0 5b 5d 41 5c c3 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 
> <55> 48 89 e5 41 57 41 56 49 c7 c6 c0 ca 1e 00 41 55 41 89 fd 41 54
> [  137.202498] 039: RSP: 0018:c9005835fbc0 EFLAGS: 0246
> [  137.208158] 039:  ORIG_RAX: ff13
> [  137.212428] 039: RAX:  RBX: 8897c3e1bb00 RCX: 
> 0001
> [  137.219994] 039: RDX: 80004008 RSI: 0006 RDI: 
> 0001
> [  137.227560] 039: RBP: 8897c3e1bb00 R08:  R09: 
> 
> [  137.235126] 039: R10: 0001 R11: 0001 R12: 
> 81001fd1
> [  137.242694] 039: R13: 0044 R14:  R15: 
> c9005835fcac
> [  137.250259] 039:  ? ___preempt_schedule+0x16/0x18
> [  137.254969] 039:  preempt_schedule_common+0x32/0x80
> [  137.259846] 039:  ___preempt_schedule+0x16/0x18
> [  137.264379] 039:  rcutorture_one_extend+0x33a/0x510 [rcutorture]
> [  137.270397] 039:  rcu_torture_one_read+0x18c/0x450 [rcutorture]
> [  137.276334] 039:  rcu_torture_reader+0xac/0x1f0 [rcutorture]
> [  137.281998] 039:  ? rcu_torture_reader+0x1f0/0x1f0 [rcutorture]
> [  137.287920] 039:  kthread+0x106/0x140
> [  137.291591] 039:  ? rcu_torture_one_read+0x450/0x450 [rcutorture]
> [  137.297681] 

[PATCH 2/3] fs/userfaultfd.c: reorder the if check to reduce some computation

2019-09-12 Thread Wei Yang
When there are several condition checks in an *if* clause, evaluation
stops at the first false one.

Since the for loop iterates over the vma list, only the last vma can
meet the condition "end <= vm_end". Reorder the checks so the cheaper,
usually-false test comes first and some computation is avoided.

Signed-off-by: Wei Yang 
---
 fs/userfaultfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 9ce09ac619a2..70c0e0ef01d7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1402,7 +1402,7 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * If this vma contains ending address, and huge pages
 * check alignment.
 */
-   if (is_vm_hugetlb_page(cur) && end <= cur->vm_end) {
+   if (end <= cur->vm_end && is_vm_hugetlb_page(cur)) {
unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
 
ret = -EINVAL;
-- 
2.17.1



[PATCH 3/3] fs/userfaultfd.c: wrap checking huge page alignment into a helper

2019-09-12 Thread Wei Yang
There are three places that check whether an address is huge page
aligned.

This patch adds a helper function to wrap that check.

Signed-off-by: Wei Yang 
---
 fs/userfaultfd.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 70c0e0ef01d7..d8665ffdd576 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1296,6 +1296,16 @@ static inline bool vma_can_userfault(struct 
vm_area_struct *vma)
vma_is_shmem(vma);
 }
 
+static inline bool addr_huge_page_aligned(unsigned long addr,
+ struct vm_area_struct *vma)
+{
+   unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+   if (addr & (vma_hpagesize - 1))
+   return false;
+   return true;
+}
+
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
 {
@@ -1363,12 +1373,8 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * If the first vma contains huge pages, make sure start address
 * is aligned to huge page size.
 */
-   if (is_vm_hugetlb_page(vma)) {
-   unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
-
-   if (start & (vma_hpagesize - 1))
-   goto out_unlock;
-   }
+   if (is_vm_hugetlb_page(vma) && !addr_huge_page_aligned(start, vma))
+   goto out_unlock;
 
/*
 * Search for not compatible vmas.
@@ -1403,11 +1409,9 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * check alignment.
 */
if (end <= cur->vm_end && is_vm_hugetlb_page(cur)) {
-   unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
-
ret = -EINVAL;
 
-   if (end & (vma_hpagesize - 1))
+   if (!addr_huge_page_aligned(end, cur))
goto out_unlock;
}
 
@@ -1551,12 +1555,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx 
*ctx,
 * If the first vma contains huge pages, make sure start address
 * is aligned to huge page size.
 */
-   if (is_vm_hugetlb_page(vma)) {
-   unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
-
-   if (start & (vma_hpagesize - 1))
-   goto out_unlock;
-   }
+   if (is_vm_hugetlb_page(vma) && !addr_huge_page_aligned(start, vma))
+   goto out_unlock;
 
/*
 * Search for not compatible vmas.
-- 
2.17.1



Re: [PATCH RESEND] fs/epoll: fix the edge-triggered mode for nested epoll

2019-09-12 Thread Jason Baron



On 9/11/19 4:19 AM, Heiher wrote:
> Hi,
> 
> On Fri, Sep 6, 2019 at 1:48 AM Jason Baron  wrote:
>>
>>
>>
>> On 9/5/19 1:27 PM, Roman Penyaev wrote:
>>> On 2019-09-05 11:56, Heiher wrote:
 Hi,

 On Thu, Sep 5, 2019 at 10:53 AM Heiher  wrote:
>
> Hi,
>
> I created an epoll wakeup test project, listed some possible cases,
> and any other corner cases needs to be added?
>
> https://github.com/heiher/epoll-wakeup/blob/master/README.md
>
> On Wed, Sep 4, 2019 at 10:02 PM Heiher  wrote:
>>
>> Hi,
>>
>> On Wed, Sep 4, 2019 at 8:02 PM Jason Baron  wrote:
>>>
>>>
>>>
>>> On 9/4/19 5:57 AM, Roman Penyaev wrote:
 On 2019-09-03 23:08, Jason Baron wrote:
> On 9/2/19 11:36 AM, Roman Penyaev wrote:
>> Hi,
>>
>> This is indeed a bug. (quick side note: could you please remove efd[1]
>> from your test, because it is not related to the reproduction of a
>> current bug).
>>
>> Your patch lacks a good description, what exactly you've fixed.  Let
>> me speak out loud and please correct me if I'm wrong, my understanding
>> of epoll internals has become a bit rusty: when epoll fds are nested
>> an attempt to harvest events (ep_scan_ready_list() call) produces a
>> second (repeated) event from an internal fd up to an external fd:
>>
>>  epoll_wait(efd[0], ...):
>>ep_send_events():
>>   ep_scan_ready_list(depth=0):
>> ep_send_events_proc():
>> ep_item_poll():
>>   ep_scan_ready_list(depth=1):
>> ep_poll_safewake():
>>   ep_poll_callback()
>> list_add_tail(&epi->rdllink, &ep->rdllist);
>> ^^
>> repeated event
>>
>>
>> In your patch you forbid wakeup for the cases, where depth != 0, i.e.
>> for all nested cases. That seems clear.  But what if we can go further
>> and remove the whole chunk, which seems excessive:
>>
>> @@ -885,26 +886,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
>>
>> -
>> -   if (!list_empty(&ep->rdllist)) {
>> -   /*
>> -* Wake up (if active) both the eventpoll wait list and
>> -* the ->poll() wait list (delayed after we release the lock).
>> -*/
>> -   if (waitqueue_active(&ep->wq))
>> -   wake_up(&ep->wq);
>> -   if (waitqueue_active(&ep->poll_wait))
>> -   pwake++;
>> -   }
>> write_unlock_irq(&ep->lock);
>>
>> if (!ep_locked)
>> mutex_unlock(&ep->mtx);
>>
>> -   /* We have to call this outside the lock */
>> -   if (pwake)
>> -   ep_poll_safewake(&ep->poll_wait);
>>
>>
>> I reason like that: by the time we've reached the point of scanning events
>> for readiness all wakeups from ep_poll_callback have been already fired and
>> new events have been already accounted in the ready list (ep_poll_callback()
>> calls the same ep_poll_safewake()). Here, frankly, I'm not 100% sure and
>> probably missing some corner cases.
>>
>> Thoughts?
>
> So the: 'wake_up(&ep->wq);' part, I think is about waking up other
> threads that may be waiting in epoll_wait(). For example, there may
> be multiple threads doing epoll_wait() on the same epoll fd, and the
> logic above seems to say thread 1 may have processed say N events and
> now it's going to go off to work on those, so let's wake up thread 2
> now to handle the next chunk.

 Not quite. The thread which calls ep_scan_ready_list() processes all the
 events, and while processing those, removes them one by one from the
 ready list.  But if the event mask is !0 and the event belongs to a
 Level Triggered Mode descriptor (let's say default mode) it tails the
 event back onto the list (because we are in level mode, so the event
 should be there).  So at the end of this traversing loop the ready list
 is likely not empty, and if so, wake up is called again for nested epoll
 fds.  But those nested epoll fds should already get all the notifications
 from the main event callback ep_poll_callback(), regardless of any thread
 which traverses events.

 I suppose this logic 

Re: [PATCH] firmware: broadcom: add OP-TEE based BNXT f/w manager

2019-09-12 Thread Ard Biesheuvel
(+ Jens)

Please make sure you have the right people on cc. Instead of Ingo,
Thomas or linux-mips (?), you might have cc'ed the OP-TEE maintainer
yourself?


On Tue, 10 Sep 2019 at 16:17, Sheetal Tigadoli
 wrote:
>
> From: Vikas Gupta 
>
> This driver registers on the TEE bus to interact with OP-TEE based
> BNXT firmware management modules.
>
> Signed-off-by: Vikas Gupta 
> Signed-off-by: Sheetal Tigadoli 
> ---
>  drivers/firmware/broadcom/Kconfig |   8 +
>  drivers/firmware/broadcom/Makefile|   1 +
>  drivers/firmware/broadcom/tee_bnxt_fw.c   | 447 ++
>  include/linux/firmware/broadcom/tee_bnxt_fw.h |  17 +
>  4 files changed, 473 insertions(+)
>  create mode 100644 drivers/firmware/broadcom/tee_bnxt_fw.c
>  create mode 100644 include/linux/firmware/broadcom/tee_bnxt_fw.h
>
> diff --git a/drivers/firmware/broadcom/Kconfig b/drivers/firmware/broadcom/Kconfig
> index 6468082..a846a21 100644
> --- a/drivers/firmware/broadcom/Kconfig
> +++ b/drivers/firmware/broadcom/Kconfig
> @@ -22,3 +22,11 @@ config BCM47XX_SPROM
>   In case of SoC devices SPROM content is stored on a flash used by
>   bootloader firmware CFE. This driver provides method to ssb and bcma
>   drivers to read SPROM on SoC.
> +
> +config TEE_BNXT_FW
> +   bool "Broadcom BNXT firmware manager"
> +   depends on ARCH_BCM_IPROC && OPTEE
> +   default ARCH_BCM_IPROC
> +   help
> + This module helps to manage firmware on the Broadcom BNXT device. The
> + module registers on the TEE bus and invokes calls to manage firmware
> + on the BNXT device.
> diff --git a/drivers/firmware/broadcom/Makefile b/drivers/firmware/broadcom/Makefile
> index 72c7fdc..17c5061 100644
> --- a/drivers/firmware/broadcom/Makefile
> +++ b/drivers/firmware/broadcom/Makefile
> @@ -1,3 +1,4 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  obj-$(CONFIG_BCM47XX_NVRAM)+= bcm47xx_nvram.o
>  obj-$(CONFIG_BCM47XX_SPROM)+= bcm47xx_sprom.o
> +obj-$(CONFIG_TEE_BNXT_FW)  += tee_bnxt_fw.o
> diff --git a/drivers/firmware/broadcom/tee_bnxt_fw.c b/drivers/firmware/broadcom/tee_bnxt_fw.c
> new file mode 100644
> index ..89a48fd
> --- /dev/null
> +++ b/drivers/firmware/broadcom/tee_bnxt_fw.c
> @@ -0,0 +1,447 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2019 Broadcom.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +#define DRIVER_NAME"tee-bnxt-fw"
> +
> +#define MAX_SHM_MEM_SZ SZ_4M
> +
> +#define MAX_TEE_PARAM_ARRY_MEMB4
> +
> +enum ta_cmd {
> +/*
> + * TA_CMD_BNXT_FASTBOOT - boot bnxt device by copying f/w into sram
> + *
> + * param[0] unused
> + * param[1] unused
> + * param[2] unused
> + * param[3] unused
> + *
> + * Result:
> + * TEE_SUCCESS - Invoke command success
> + * TEE_ERROR_ITEM_NOT_FOUND - Corrupt f/w image found on memory
> + */
> +   TA_CMD_BNXT_FASTBOOT = 0,
> +
> +/*
> + * TA_CMD_BNXT_HEALTH_STATUS - to check health of bnxt device
> + *
> + * param[0] (out value) - value.a: health status
> + * param[1] unused
> + * param[2] unused
> + * param[3] unused
> + *
> + * Result:
> + * TEE_SUCCESS - Invoke command success
> + * TEE_ERROR_BAD_PARAMETERS - Incorrect input param
> + */
> +   TA_CMD_BNXT_HEALTH_STATUS,
> +
> +/*
> + * TA_CMD_BNXT_HANDSHAKE - to check bnxt device is booted
> + *
> + * param[0] (in value)  - value.a: max timeout value
> + * param[0] (out value) - value.a: boot status
> + * param[1] unused
> + * param[2] unused
> + * param[3] unused
> + *
> + * Result:
> + * TEE_SUCCESS - Invoke command success
> + * TEE_ERROR_BAD_PARAMETERS - Incorrect input param
> + */
> +   TA_CMD_BNXT_HANDSHAKE,
> +
> +/*
> + * TA_CMD_BNXT_COPY_COREDUMP - copy the core dump into shm
> + *
> + * param[0] (in value) - value.a: offset at which data to be copied from
> + *  value.b: size of the data
> + * param[1] unused
> + * param[2] unused
> + * param[3] unused
> + *
> + * Result:
> + * TEE_SUCCESS - Invoke command success
> + * TEE_ERROR_BAD_PARAMETERS - Incorrect input param
> + * TEE_ERROR_ITEM_NOT_FOUND - Corrupt core dump
> + */
> +   TA_CMD_BNXT_COPY_COREDUMP,
> +
> +/*
> + * TA_CMD_BNXT_FW_UPGRADE - upgrade the bnxt firmware
> + *
> + * param[0] (in value) - value.a: size of the f/w image
> + * param[1] unused
> + * param[2] unused
> + * param[3] unused
> + *
> + * Result:
> + * TEE_SUCCESS - Invoke command success
> + * TEE_ERROR_BAD_PARAMETERS - Incorrect input param
> + */
> +   TA_CMD_BNXT_FW_UPGRADE,
> +};
> +
> +/**
> + * struct tee_bnxt_fw_private - OP-TEE bnxt private data
> + * @dev:   OP-TEE based bnxt device.
> + * @ctx:   OP-TEE context handler.
> + * @session_id:TA session identifier.
> + */
> +struct tee_bnxt_fw_private {
> +   struct device *dev;
> +   struct tee_context *ctx;
> +   u32 session_id;
> +   struct tee_shm *fw_shm_pool;
> 
