[PATCH 4/4] kvm: Add VFIO device for handling IOMMU cache coherency

2013-10-01 Thread Alex Williamson
So far we've succeeded at making KVM and VFIO mostly unaware of each
other, but there's an important point where that breaks down.  Intel
VT-d hardware may or may not support snoop control.  When snoop
control is available, intel-iommu promotes No-Snoop transactions on
PCIe to be cache coherent.  That allows KVM to handle things like the
x86 WBINVD opcode as a nop.  When the hardware does not support this,
KVM must implement a hardware visible WBINVD for the guest.

We could simply let userspace tell KVM how to handle WBINVD, but it's
privileged for a reason.  Allowing an arbitrary user to enable
physical WBINVD gives them more access to the hardware.  Previously,
this has only been enabled for guests supporting legacy PCI device
assignment.  In such cases it's necessary for proper guest execution.
We therefore create a new KVM-VFIO virtual device.  The user can add
and remove VFIO groups to this device via file descriptors.  KVM
makes use of the VFIO external user interface to validate that the
user has access to physical hardware and, for now, assumes the I/O
is noncoherent.  Eventually we'll add an interface to allow KVM to
determine the conherency of the domain as noted in the TODO.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 Documentation/virtual/kvm/devices/vfio.txt |   22 ++
 arch/x86/kvm/Makefile  |2 
 include/linux/kvm_host.h   |1 
 include/uapi/linux/kvm.h   |4 
 virt/kvm/kvm_main.c|3 
 virt/kvm/vfio.c|  262 
 6 files changed, 293 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/virtual/kvm/devices/vfio.txt
 create mode 100644 virt/kvm/vfio.c

diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
b/Documentation/virtual/kvm/devices/vfio.txt
new file mode 100644
index 000..ef51740
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -0,0 +1,22 @@
+VFIO virtual device
+===
+
+Device types supported:
+  KVM_DEV_TYPE_VFIO
+
+Only one VFIO instance may be created per VM.  The created device
+tracks VFIO groups in use by the VM and features of those groups
+important to the correctness and acceleration of the VM.  As groups
+are enabled and disabled for use by the VM, KVM should be updated
+about their presence.  When registered with KVM, a reference to the
+VFIO-group is held by KVM.
+
+Groups:
+  KVM_DEV_VFIO_GROUP
+
+KVM_DEV_VFIO_GROUP attributes:
+  KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
+  KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
+
+For each, kvm_device_attr.addr points to an int32_t file descriptor
+for the VFIO group.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index bf4fb04..25d22b2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,7 @@ KVM := ../../../virt/kvm
 
 kvm-y  += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
-   $(KVM)/eventfd.o $(KVM)/irqchip.o
+   $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)+= $(KVM)/assigned-dev.o $(KVM)/iommu.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e239c93..8a6bfa0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1082,6 +1082,7 @@ struct kvm_device *kvm_device_from_filp(struct file 
*filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_vfio_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 99c2533..7c1a349 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -843,6 +843,10 @@ struct kvm_device_attr {
 #define KVM_DEV_TYPE_FSL_MPIC_20   1
 #define KVM_DEV_TYPE_FSL_MPIC_42   2
 #define KVM_DEV_TYPE_XICS  3
+#define KVM_DEV_TYPE_VFIO  4
+#define  KVM_DEV_VFIO_GROUP1
+#define   KVM_DEV_VFIO_GROUP_ADD   1
+#define   KVM_DEV_VFIO_GROUP_DEL   2
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8727820..85185af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2273,6 +2273,9 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
ops = kvm_xics_ops;
break;
 #endif
+   case KVM_DEV_TYPE_VFIO:
+   ops = kvm_vfio_ops;
+   break;
default:
return -ENODEV;
}
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
new file mode 100644
index 000..37a8261
--- /dev/null
+++ b/virt/kvm/vfio.c
@@ -0,0 +1,262 @@
+/*
+ * VFIO-KVM bridge pseudo device
+ *
+ * Copyright (C) 2013 Red Hat, 

[PATCH 2/4] kvm/x86: Convert iommu_flags to iommu_noncoherent

2013-10-01 Thread Alex Williamson
Default to operating in coherent mode.  This simplifies the logic when
we switch to a model of registering and unregistering noncoherent I/O
with KVM.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 arch/ia64/include/asm/kvm_host.h |2 +-
 arch/x86/include/asm/kvm_host.h  |2 +-
 arch/x86/kvm/vmx.c   |2 +-
 arch/x86/kvm/x86.c   |2 +-
 include/linux/kvm_host.h |3 ---
 virt/kvm/iommu.c |   16 
 6 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 989dd3f..1933b7a 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -480,7 +480,7 @@ struct kvm_arch {
 
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
-   int iommu_flags;
+   bool iommu_noncoherent;
 
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c76ff74..1b6b5f9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -557,7 +557,7 @@ struct kvm_arch {
 
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
-   int iommu_flags;
+   bool iommu_noncoherent;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a1216de..8b2270a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7405,7 +7405,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t 
gfn, bool is_mmio)
if (is_mmio)
ret = MTRR_TYPE_UNCACHABLE  VMX_EPT_MT_EPTE_SHIFT;
else if (vcpu-kvm-arch.iommu_domain 
-   !(vcpu-kvm-arch.iommu_flags  KVM_IOMMU_CACHE_COHERENCY))
+vcpu-kvm-arch.iommu_noncoherent)
ret = kvm_get_guest_memory_type(vcpu, gfn) 
  VMX_EPT_MT_EPTE_SHIFT;
else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5ca72a..b1231b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2716,7 +2716,7 @@ static void wbinvd_ipi(void *garbage)
 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
return vcpu-kvm-arch.iommu_domain 
-   !(vcpu-kvm-arch.iommu_flags  KVM_IOMMU_CACHE_COHERENCY);
+  vcpu-kvm-arch.iommu_noncoherent;
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0fbbc7a..f46da56 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -747,9 +747,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
-/* For vcpu-arch.iommu_flags */
-#define KVM_IOMMU_CACHE_COHERENCY  0x1
-
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 72a130b..9cde444 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct 
kvm_memory_slot *slot)
flags = IOMMU_READ;
if (!(slot-flags  KVM_MEM_READONLY))
flags |= IOMMU_WRITE;
-   if (kvm-arch.iommu_flags  KVM_IOMMU_CACHE_COHERENCY)
+   if (!kvm-arch.iommu_noncoherent)
flags |= IOMMU_CACHE;
 
 
@@ -158,7 +158,8 @@ int kvm_assign_device(struct kvm *kvm,
 {
struct pci_dev *pdev = NULL;
struct iommu_domain *domain = kvm-arch.iommu_domain;
-   int r, last_flags;
+   int r;
+   bool noncoherent;
 
/* check if iommu exists and in use */
if (!domain)
@@ -174,15 +175,13 @@ int kvm_assign_device(struct kvm *kvm,
return r;
}
 
-   last_flags = kvm-arch.iommu_flags;
-   if (iommu_domain_has_cap(kvm-arch.iommu_domain,
-IOMMU_CAP_CACHE_COHERENCY))
-   kvm-arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+   noncoherent = !iommu_domain_has_cap(kvm-arch.iommu_domain,
+   IOMMU_CAP_CACHE_COHERENCY);
 
/* Check if need to update IOMMU page table for guest memory */
-   if ((last_flags ^ kvm-arch.iommu_flags) ==
-   KVM_IOMMU_CACHE_COHERENCY) {
+   if (noncoherent != kvm-arch.iommu_noncoherent) {
kvm_iommu_unmap_memslots(kvm);
+   kvm-arch.iommu_noncoherent = noncoherent;
r = kvm_iommu_map_memslots(kvm);
if (r)
goto out_unmap;
@@ -350,6 +349,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
mutex_lock(kvm-slots_lock);

[PATCH 0/4] KVM noncoherent DMA registration and VFIO pseudo device

2013-10-01 Thread Alex Williamson
This is a follow-up to my previous RFC series on this topic.  The goal
is to unify how KVM manages guests in the presence of non-coherent DMA
trafic and provide a way for QEMU to register VFIO groups to enable
that support.  Since this changes the way KVM handles things like
WBINVD, we use the VFIO external user interface so that KVM can verify
if a user has physical hardware access.  For now we assume VFIO domains
are always non-coherent, which is true if the IOMMU driver honors the
IOMMU_CACHE mapping attribute.  Once this is in and we figure out what
exactly VFIO should do regarding support for coherent IOMMU domains,
we can create a new interface for KVM to call.  This is noted with a
TODO in this patch.  I've reworked the device interface to use
attributes and pass an address for the file descriptor since the RFC.
Alexey, I expect that a new attribute to associate an LIOBN to a group
and arch callbacks to make use of that information is all that you'll
need to add for POWER.  Thanks,

Alex

---

Alex Williamson (4):
  kvm: Destroy  free KVM devices on release
  kvm/x86: Convert iommu_flags to iommu_noncoherent
  kvm: Create non-coherent DMA registeration
  kvm: Add VFIO device for handling IOMMU cache coherency


 Documentation/virtual/kvm/devices/vfio.txt |   22 ++
 arch/ia64/include/asm/kvm_host.h   |2 
 arch/powerpc/kvm/book3s_xics.c |1 
 arch/x86/include/asm/kvm_host.h|4 
 arch/x86/kvm/Makefile  |2 
 arch/x86/kvm/vmx.c |3 
 arch/x86/kvm/x86.c |   21 ++
 include/linux/kvm_host.h   |   23 ++
 include/uapi/linux/kvm.h   |4 
 virt/kvm/iommu.c   |   22 +-
 virt/kvm/kvm_main.c|8 +
 virt/kvm/vfio.c|  262 
 12 files changed, 355 insertions(+), 19 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/vfio.txt
 create mode 100644 virt/kvm/vfio.c
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] kvm: Destroy free KVM devices on release

2013-10-01 Thread Alex Williamson
The KVM device interface allocates a struct kvm_device and calls
kvm_device_ops.create on it from KVM VM ioctl KVM_CREATE_DEVICE.
This returns a file descriptor to the user for them to set/get/check
further attributes.  On closing the file descriptor, one would assume
that kvm_device_ops.destroy is called and all traces of the device
would go away.  One would be wrong, it actually does nothing more
than release the struct kvm reference, waiting until the VM is
destroyed before doing more.  This leaves devices that only want a
single instance of themselves per VM in a tough spot.

To fix this, do full cleanup on release of the device file descriptor.
It's also non-symmetric that one of the existing devices frees the
struct kvm_device from it's .destroy function, while the other
doesn't.  KVM-core allocates the structure and should therefore be
responsible for freeing it.  Finally, add a missing kfree for the
device creation error path.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 arch/powerpc/kvm/book3s_xics.c |1 -
 virt/kvm/kvm_main.c|5 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a3a5cb8..9a82426 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1220,7 +1220,6 @@ static void kvmppc_xics_free(struct kvm_device *dev)
for (i = 0; i = xics-max_icsid; i++)
kfree(xics-ics[i]);
kfree(xics);
-   kfree(dev);
 }
 
 static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 979bff4..8727820 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -572,6 +572,7 @@ static void kvm_destroy_devices(struct kvm *kvm)
 
list_del(node);
dev-ops-destroy(dev);
+   kfree(dev);
}
 }
 
@@ -2229,6 +2230,9 @@ static int kvm_device_release(struct inode *inode, struct 
file *filp)
struct kvm_device *dev = filp-private_data;
struct kvm *kvm = dev-kvm;
 
+   list_del(dev-vm_node);
+   dev-ops-destroy(dev);
+   kfree(dev);
kvm_put_kvm(kvm);
return 0;
 }
@@ -2292,6 +2296,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
ret = anon_inode_getfd(ops-name, kvm_device_fops, dev, O_RDWR | 
O_CLOEXEC);
if (ret  0) {
ops-destroy(dev);
+   kfree(dev);
return ret;
}
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] kvm: Create non-coherent DMA registeration

2013-10-01 Thread Alex Williamson
We currently use some ad-hoc arch variables tied to legacy KVM device
assignment to manage emulation of instructions that depend on whether
non-coherent DMA is present.  Create an interface for this so that we
can register coherency for other devices, like vfio assigned devices.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |2 ++
 arch/x86/kvm/vmx.c  |3 +--
 arch/x86/kvm/x86.c  |   21 +++--
 include/linux/kvm_host.h|   19 +++
 virt/kvm/iommu.c|6 ++
 5 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1b6b5f9..50c1e9c1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -558,6 +558,8 @@ struct kvm_arch {
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
+#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
+   atomic_t noncoherent_dma_count;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8b2270a..a982c9e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7404,8 +7404,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t 
gfn, bool is_mmio)
 */
if (is_mmio)
ret = MTRR_TYPE_UNCACHABLE  VMX_EPT_MT_EPTE_SHIFT;
-   else if (vcpu-kvm-arch.iommu_domain 
-vcpu-kvm-arch.iommu_noncoherent)
+   else if (kvm_arch_has_noncoherent_dma(vcpu-kvm))
ret = kvm_get_guest_memory_type(vcpu, gfn) 
  VMX_EPT_MT_EPTE_SHIFT;
else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b1231b0..feec86d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2715,8 +2715,7 @@ static void wbinvd_ipi(void *garbage)
 
 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
-   return vcpu-kvm-arch.iommu_domain 
-  vcpu-kvm-arch.iommu_noncoherent;
+   return kvm_arch_has_noncoherent_dma(vcpu-kvm);
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -7420,6 +7419,24 @@ bool kvm_arch_can_inject_async_page_present(struct 
kvm_vcpu *vcpu)
kvm_x86_ops-interrupt_allowed(vcpu);
 }
 
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+   atomic_inc(kvm-arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+   atomic_dec(kvm-arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+   return atomic_read(kvm-arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f46da56..e239c93 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -671,6 +671,25 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
+#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
+#else
+static inline void kvm_arch_register_noncoherent_dma(void)
+{
+}
+
+static inline void kvm_arch_unregister_noncoherent_dma(void)
+{
+}
+
+static inline bool kvm_arch_has_noncoherent_dma(void)
+{
+   return false;
+}
+#endif
+
 static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 9cde444..0a54456 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -140,6 +140,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
 
+   if (kvm-arch.iommu_noncoherent)
+   kvm_arch_register_noncoherent_dma(kvm);
+
idx = srcu_read_lock(kvm-srcu);
slots = kvm_memslots(kvm);
 
@@ -335,6 +338,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 
srcu_read_unlock(kvm-srcu, idx);
 
+   if (kvm-arch.iommu_noncoherent)
+   kvm_arch_unregister_noncoherent_dma(kvm);
+
return 0;
 }
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 1/2] mm: rearrange madvise code to allow for reuse

2013-10-01 Thread Colin Cross
This patch refactors the madvise syscall to allow for parts of it
to be reused by a prctl syscall that affects vmas.

Move the code that walks vmas in a virtual address range into a
function that takes a function pointer as a parameter.  The only
caller for now is sys_madvise, which uses it to call
madvise_vma_behavior on each vma, but the next patch will add
an additional caller.

Move handling all vma behaviors inside madvise_behavior, and
rename it to madvise_vma_behavior.

Move the code that updates the flags on a vma, including splitting
or merging the vma as necessary, into a new function called
madvise_update_vma.  The next patch will add support for updating
a new anon_name field as well.

Signed-off-by: Colin Cross ccr...@android.com
---
 mm/madvise.c | 272 +--
 1 file changed, 151 insertions(+), 121 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..b8820fd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int behavior)
 }
 
 /*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on regiion of a vma, splitting it or merging it as
+ * necessary.  Must be called with mmap_sem held for writing;
  */
-static long madvise_behavior(struct vm_area_struct * vma,
-struct vm_area_struct **prev,
-unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+struct vm_area_struct **prev, unsigned long start,
+unsigned long end, unsigned long new_flags)
 {
struct mm_struct * mm = vma-vm_mm;
-   int error = 0;
pgoff_t pgoff;
-   unsigned long new_flags = vma-vm_flags;
-
-   switch (behavior) {
-   case MADV_NORMAL:
-   new_flags = new_flags  ~VM_RAND_READ  ~VM_SEQ_READ;
-   break;
-   case MADV_SEQUENTIAL:
-   new_flags = (new_flags  ~VM_RAND_READ) | VM_SEQ_READ;
-   break;
-   case MADV_RANDOM:
-   new_flags = (new_flags  ~VM_SEQ_READ) | VM_RAND_READ;
-   break;
-   case MADV_DONTFORK:
-   new_flags |= VM_DONTCOPY;
-   break;
-   case MADV_DOFORK:
-   if (vma-vm_flags  VM_IO) {
-   error = -EINVAL;
-   goto out;
-   }
-   new_flags = ~VM_DONTCOPY;
-   break;
-   case MADV_DONTDUMP:
-   new_flags |= VM_DONTDUMP;
-   break;
-   case MADV_DODUMP:
-   if (new_flags  VM_SPECIAL) {
-   error = -EINVAL;
-   goto out;
-   }
-   new_flags = ~VM_DONTDUMP;
-   break;
-   case MADV_MERGEABLE:
-   case MADV_UNMERGEABLE:
-   error = ksm_madvise(vma, start, end, behavior, new_flags);
-   if (error)
-   goto out;
-   break;
-   case MADV_HUGEPAGE:
-   case MADV_NOHUGEPAGE:
-   error = hugepage_madvise(vma, new_flags, behavior);
-   if (error)
-   goto out;
-   break;
-   }
+   int error;
 
if (new_flags == vma-vm_flags) {
*prev = vma;
-   goto out;
+   return 0;
}
 
pgoff = vma-vm_pgoff + ((start - vma-vm_start)  PAGE_SHIFT);
@@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_area_struct * vma,
if (start != vma-vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
-   goto out;
+   return error;
}
 
if (end != vma-vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
-   goto out;
+   return error;
}
 
 success:
@@ -128,10 +83,7 @@ success:
 */
vma-vm_flags = new_flags;
 
-out:
-   if (error == -ENOMEM)
-   error = -EAGAIN;
-   return error;
+   return 0;
 }
 
 #ifdef CONFIG_SWAP
@@ -337,6 +289,77 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
 }
 
+/*
+ * Apply an madvise behavior to a region of a vma.  madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+struct vm_area_struct **prev,
+unsigned long start, unsigned long end,
+unsigned long behavior)
+{
+   int error = 0;
+   unsigned long new_flags = vma-vm_flags;
+
+   switch (behavior) {
+   case MADV_REMOVE:
+   return madvise_remove(vma, prev, start, end);
+   case MADV_WILLNEED:
+   return madvise_willneed(vma, prev, 

Re: Fwd: [v3.12-rc1] [regression] PM / hibernate: Create memory bitmaps after freezing user space

2013-10-01 Thread Rafael J. Wysocki
On Tuesday, October 01, 2013 06:38:03 PM Ronald wrote:
 This could be a coincidence, but I had a disk data corruption in /var
 (LVM). Most other partitions are read-only during normal operation. So
 I can safely keep testing this kernel. Just mentioning this, in case
 you see this happening with other ppl testing this patch as well.

If you have a failing resume from hibernation a filesystem is perfectly
possible, unfortunately.

The patch itself should't really cause any corruption to happen.

Thanks,
Rafael


 2013/9/30 Rafael J. Wysocki r...@rjwysocki.net:
  On Monday, September 30, 2013 07:45:54 AM Ronald wrote:
  Yes, works as well. Just survived twe cycles with s2disk. I'm
  surprised someone else did not report this earlier btw... Because it
  looks pretty generic (i.e. not specific to a 64bit UP system).
 
  It is a generic bug, actually.  Well, this means the user space driven
  hibernation doesn't receive much testing coverage these days ...
 
  I'll resend the patch with a proper changelog and then push it for 
  v3.12-rc4.
 
  Thanks,
  Rafael
 
 
  2013/9/30 Rafael J. Wysocki r...@rjwysocki.net:
   On Sunday, September 29, 2013 09:22:45 AM Ronald wrote:
   Attached patch fixes the issue. Both methods function as they did
   before. Thanks for the superfast fix!
  
   You're welcome, it's not the final one, however.
  
   Can you please test the one below and report back?
  
   Rafael
  
  
   ---
kernel/power/snapshot.c |5 -
kernel/power/user.c |8 
2 files changed, 12 insertions(+), 1 deletion(-)
  
   Index: linux-pm/kernel/power/snapshot.c
   ===
   --- linux-pm.orig/kernel/power/snapshot.c
   +++ linux-pm/kernel/power/snapshot.c
   @@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void)
   struct memory_bitmap *bm1, *bm2;
   int error = 0;
  
   -   BUG_ON(forbidden_pages_map || free_pages_map);
   +   if (forbidden_pages_map  free_pages_map)
   +   return 0;
   +   else
   +   BUG_ON(forbidden_pages_map || free_pages_map);
  
   bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
   if (!bm1)
   Index: linux-pm/kernel/power/user.c
   ===
   --- linux-pm.orig/kernel/power/user.c
   +++ linux-pm/kernel/power/user.c
   @@ -39,6 +39,7 @@ static struct snapshot_data {
   char frozen;
   char ready;
   char platform_support;
   +   bool free_bitmaps;
} snapshot_state;
  
atomic_t snapshot_device_available = ATOMIC_INIT(1);
   @@ -82,6 +83,10 @@ static int snapshot_open(struct inode *i
   data-swap = -1;
   data-mode = O_WRONLY;
   error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
   +   if (!error) {
   +   error = create_basic_memory_bitmaps();
   +   data-free_bitmaps = !error;
   +   }
   if (error)
   pm_notifier_call_chain(PM_POST_RESTORE);
   }
   @@ -111,6 +116,8 @@ static int snapshot_release(struct inode
   pm_restore_gfp_mask();
   free_basic_memory_bitmaps();
   thaw_processes();
   +   } else if (data-free_bitmaps) {
   +   free_basic_memory_bitmaps();
   }
   pm_notifier_call_chain(data-mode == O_RDONLY ?
   PM_POST_HIBERNATION : PM_POST_RESTORE);
   @@ -231,6 +238,7 @@ static long snapshot_ioctl(struct file *
   break;
   pm_restore_gfp_mask();
   free_basic_memory_bitmaps();
   +   data-free_bitmaps = false;
   thaw_processes();
   data-frozen = 0;
   break;
  
  --
  I speak only for myself.
  Rafael J. Wysocki, Intel Open Source Technology Center.
-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/9] procfs: protect /proc/pid/* files with file-f_cred

2013-10-01 Thread Djalal Harouni
/proc/pid/* entries varies at runtime, appropriate permission checks
need to happen during each system call.

Currently some of these sensitive entries are protected by performing
the ptrace_may_access() check. However even with that the /proc file
descriptors can be passed to a more privileged process
(e.g. a suid-exec) which will pass the classic ptrace_may_access()
check. In general the -open() call will be issued by an unprivileged
process while the -read(),-write() calls by a more privileged one.

Example of these files are:
/proc/*/syscall, /proc/*/stack etc.

And any open(/proc/self/*) then suid-exec to read()/write() /proc/self/*


These files are protected during read() by the ptrace_may_access(),
however the file descriptor can be passed to a suid-exec which can be
used to read data and bypass ASLR. Of course this was discussed several
times on LKML.


Solution:
Here we propose a clean solution that uses available mechanisms. No
additions, nor new structs/memory allocation...

After a discussion about some /proc/pid/* file permissions [1],
Eric W. Biederman proposed to use the file-f_cred to check if current's
cred have changed [2], actually he said that Linus was looking
on using the file-f_cred to implement a similar thing! But run in
problems with Chromes sandbox ? a link please ?


So here are my experiments:
The idea is to track the cred of current. If the cred change between
-open() and read()/write() this means that current has lost or gained
some X privileges. If so, in addition to the classic ptrace_may_access()
check, perform a second check using the file's opener cred. This means,
if an unprivileged process that tries to use a privileged one
(e.g. suid-exec) to read privileged files will get caught. The original
process that opened the file does not have the appropriate permissions
to read()/write() the target /proc/pid/* entry.

This second check is done of course during read(),write()...


Advantages of the proposed solution:
* It uses available mechanisms: file-f_cred which is a const cred
  that reference the cred of the opener.

* The file-f_cred can be easily referenced

* It allows to pass file descriptors under certain conditions:
  (1) current at open time may have enough permissions
  (2) current does a suid-exec or change its ruid/euid (new cred)
  (3) current or suid-exec tries to read from the task /proc entry
 Allow the -read() only if the file's opener cred at (1)
 have enough permissions on *this* task /proc entry during
 *this* -read() moment. Otherwise fail.

  IOW allow it, if the opener does not need the help of a suid-exec to
  read/write data.


Disadvantage:
* Currently only /proc/*/{stack,syscall,stat,personality} are handled.
  If the solution is accepted I'll continue with other files, taking
  care to not break userspace. All the facilities are provided.
* Your feedback


[1] https://lkml.org/lkml/2013/8/26/354
[2] https://lkml.org/lkml/2013/8/31/209


Change log
--
v1-v2:
 - Removed the file-f_cred member that was added to seq_file struct.
   Al Viro didn't like it, and Linus suggested to have a pointer on
   'file struct', so it's done by using seq_file-private

 - Added Acked-by: Kees Cook to
   [PATCH v2 4/9] procfs: make /proc/*/{stack,syscall} 0400

 - Added suggested-by: Eric W. Biederman  to
   [PATCH v2 1/9] procfs: add proc_same_open_cred() to check if the cred
   have changed
   [PATCH v2 2/9] procfs: add proc_allow_access() to check if file's
   opener may access task
   [PATCH v2 3/9] procfs: Document the proposed solution to protect
   procfs entries

 - Patchset cleaned

Version 1 was discussed here:
https://lkml.org/lkml/2013/9/25/459


The following series tries to implement what I describe.


Djalal Harouni (9):
 procfs: add proc_same_open_cred() to check if the cred have changed
 procfs: add proc_allow_access() to check if file's opener may access task
 procfs: Document the proposed solution to protect procfs entries
 procfs: make /proc/*/{stack,syscall} 0400
 procfs: make /proc entries that use seq files able to access file-f_cred
 procfs: add permission checks on the file's opener of /proc/*/stat
 procfs: add permission checks on the file's opener of /proc/*/personality
 procfs: improve permission checks on /proc/*/stack
 procfs: improve permission checks on /proc/*/syscall

 fs/proc/array.c|  16 ++-
 fs/proc/base.c | 301 
++---
 fs/proc/internal.h |   3 ++
 3 files changed, 292 insertions(+), 28 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/9] procfs: add proc_same_open_cred() to check if the cred have changed

2013-10-01 Thread Djalal Harouni
Since /proc entries varies at runtime, permission checks need to
happen during each system call.

However even with that /proc file descriptors can be passed to a more
privileged process (e.g. a suid-exec) which will pass the classic
ptrace_may_access() permission check. The open() call will be issued in
general by an unprivileged process while the disclosure of sensitive
/proc information will happen using a more privileged process at
read(), write()...

The /proc need to know if the cred of the original opener matches the
cred of current in order to take the appropriate actions.

The match concerns the cred fields that are used in the
ptrace_may_access() check: uid, gid and cap_permitted.

If there is a match then the current task that tries to read/write
/proc entries has the same privileges as the task that issued the
open() call.

However if the match fails then the cred have changed which means that
perhaps we have gain or lost the privileges of processing the /proc
file descriptor. So add proc_same_open_cred() to check if the cred have
changed.

Cc: Kees Cook keesc...@chromium.org
Suggested-by: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 29 +
 fs/proc/internal.h |  1 +
 2 files changed, 30 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1485e38..e834946 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -139,6 +139,35 @@ struct pid_entry {
NULL, proc_single_file_operations, \
{ .proc_show = show } )
 
+/**
+ * proc_same_open_cred - Check if the file's opener cred matches the
+ * current cred.
+ * @fcred:  The file's opener cred (file-f_cred)
+ *
+ * Return 1 if the cred of the file's opener matches the cred of
+ * current, otherwise 0.
+ *
+ * The match concerns the cred that are used in the ptrace_may_access()
+ * check to allow /proc task access. These cred are: uid,gid and
+ * cap_permitted.
+ *
+ * This function can be used to check if /proc file descriptors where
+ * passed to a more privileged process (e.g. execs a suid executable).
+ */
+int proc_same_open_cred(const struct cred *fcred)
+{
+   const struct cred *cred = current_cred();
+   const struct user_namespace *set_ns = fcred-user_ns;
+   const struct user_namespace *subset_ns = cred-user_ns;
+
+   if (set_ns != subset_ns)
+   return 0;
+
+   return (uid_eq(fcred-uid, cred-uid) 
+   gid_eq(fcred-gid, cred-gid) 
+   cap_issubset(cred-cap_permitted, fcred-cap_permitted));
+}
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a..e2459f4 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -159,6 +159,7 @@ extern int proc_pid_statm(struct seq_file *, struct 
pid_namespace *,
 /*
  * base.c
  */
+extern int proc_same_open_cred(const struct cred *);
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int proc_setattr(struct dentry *, struct iattr *);
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Unusually high system CPU usage with recent kernels

2013-10-01 Thread Paul E. McKenney
On Sat, Sep 14, 2013 at 03:59:51PM +0200, Tibor Billes wrote:
  From: Paul E. McKenney Sent: 09/13/13 02:19 AM
  On Wed, Sep 11, 2013 at 08:46:04AM +0200, Tibor Billes wrote:
From: Paul E. McKenney Sent: 09/09/13 10:44 PM

[ . . . ]

   Sure. The attached tar file contains traces of good kernels. The first is 
   with
   version 3.9.7 (no patch applied) which was the last stable kernel I tried 
   and
   didn't have this issue. The second is version 3.11 with your fix applied.
   Judging by the size of the traces, 3.11.0+ is still doing more work than
   3.9.7.
  
  Indeed, though quite a bit less than the problematic traces.
  
  Did you have all three patches applied to 3.11.0, or just the last one?
  If the latter, could you please try it with all three?
 
 Only the last one was applied to 3.11.0. The attachement now contains the
 RCU trace with all thee applied. It seems to be smaller in size, but still
 not close to 3.9.7.
 
 I'm not sure about LKML policies about attaching not-so-small files to
 emails, so I've dropped LKML from the CC list. Please CC the mailing
 list in your reply.

Done!

Another approach is to post the traces on the web and send the URL to
LKML. But whatever works for you is fine by me.
   
   Sending directly to you won again :) Could you please CC the list in your
   reply?
  
  Done! ;-)
 
 Could you please CC the list in your reply again? :)

This trace is quite strange -- there are a number of processes that sleep
and then wake up almost immediately, as in within a microsecond or so.
I don't know why they would be doing that, and I also cannot think of
what RCU might be doing to make them do that.

I am unfortunately out of ideas on this one.

Thanx, Paul

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/9] procfs: add proc_allow_access() to check if file's opener may access task

2013-10-01 Thread Djalal Harouni
Since /proc entries varies at runtime, permission checks need to happen
during each system call.

However even with that /proc file descriptors can be passed to a more
privileged process (e.g. a suid-exec) which will pass the classic
ptrace_may_access() permission check. The open() call will be issued in
general by an unprivileged process while the disclosure of sensitive
/proc information will happen using a more privileged process at
read(),write()...

Therfore we need a more sophisticated check to detect if the cred of the
process have changed, and if the cred of the original opener that are
stored in the file-f_cred have enough permission to access the task's
/proc entries during read(), write()...

Add the proc_allow_access() function that will receive the file-f_cred
as an argument, and tries to check if the opener had enough permission
to access the task's /proc entries.

This function should be used with the ptrace_may_access() check.

Cc: Kees Cook keesc...@chromium.org
Suggested-by: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 56 ++
 fs/proc/internal.h |  2 ++
 2 files changed, 58 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e834946..c29eeae 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -168,6 +168,62 @@ int proc_same_open_cred(const struct cred *fcred)
cap_issubset(cred-cap_permitted, fcred-cap_permitted));
 }
 
+/* Returns 0 on success, -errno on denial. */
+static int __proc_allow_access(const struct cred *cred,
+  struct task_struct *task, unsigned int mode)
+{
+   int ret = 0;
+   const struct cred *tcred;
+   const struct cred *fcred = cred;
+
+   rcu_read_lock();
+   tcred = __task_cred(task);
+   if (uid_eq(fcred-uid, tcred-euid) 
+   uid_eq(fcred-uid, tcred-suid) 
+   uid_eq(fcred-uid, tcred-uid)  
+   gid_eq(fcred-gid, tcred-egid) 
+   gid_eq(fcred-gid, tcred-sgid) 
+   gid_eq(fcred-gid, tcred-gid))
+   goto out;
+
+   if (mode  PTRACE_MODE_NOAUDIT)
+   ret = security_capable_noaudit(fcred, tcred-user_ns,
+  CAP_SYS_PTRACE);
+   else
+   ret = security_capable(fcred, tcred-user_ns,
+  CAP_SYS_PTRACE);
+
+out:
+   rcu_read_unlock();
+   return !ret ? ret : -EPERM;
+}
+
+/**
+ * proc_allow_access - Check if the file's opener had enough permissions
+ * to access the target process.
+ * @fcred:  The file's opener cred (file-f_cred)
+ * @task:  The target task we want to inspect
+ * @mode:  The ptrace mode
+ *
+ * Return a non-zero if the file's opener had enough permissions to
+ * access the task's /proc entries.
+ *
+ * Since this function will check the permissions of the opener
+ * against the target task, it can be used to protect /proc files
+ * from opening a /proc file descriptor and do a suid-exec.
+ *
+ * Callers must hold the task-signal-cred_guard_mutex
+ */
+int proc_allow_access(const struct cred *fcred,
+ struct task_struct *task, unsigned int mode)
+{
+   int ret;
+   task_lock(task);
+   ret = __proc_allow_access(fcred, task, mode);
+   task_unlock(task);
+   return !ret;
+}
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e2459f4..c3f3c34 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -159,6 +159,8 @@ extern int proc_pid_statm(struct seq_file *, struct 
pid_namespace *,
 /*
  * base.c
  */
+extern int proc_allow_access(const struct cred *,
+struct task_struct *, unsigned int);
 extern int proc_same_open_cred(const struct cred *);
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/9] procfs: Document the proposed solution to protect procfs entries

2013-10-01 Thread Djalal Harouni
Note the proposed solution to protect sensitive procfs entries as
code comment.

Cc: Kees Cook keesc...@chromium.org
Suggested-by: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c29eeae..8d21316 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -102,6 +102,17 @@
  *
  * The classic example of a problem is opening file descriptors
  * in /proc for a task before it execs a suid executable.
+ *
+ * Solution for sensitive files:
+ * At each system call: open(),read(),write()... Perform the
+ * ptrace_may_access() check.
+ *
+ * After open() and during each system call: read(),write()...
+ * If the cred of current have changed then perform the
+ * proc_allow_access() check after the ptrace_may_access() one.
+ *
+ * This way we can determine if current has gained more privileges
+ * by execs a suid executable.
  */
 
 struct pid_entry {
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V4 2/3] pincntrl: add support for AMS AS3722 pin control driver

2013-10-01 Thread Stephen Warren
On 09/24/2013 11:47 AM, Laxman Dewangan wrote:
 The AS3722 is a compact system PMU suitable for mobile phones, tablets etc.
 
 Add a driver to support accessing the GPIO, pinmux and pin configuration
 of 8 GPIO pins found on the AMS AS3722 through pin control driver and
 gpiolib.
 
 The driver will register itself as the pincontrol driver and gpio driver.

 diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-as3722.txt 
 b/Documentation/devicetree/bindings/pinctrl/pinctrl-as3722.txt

 +AMS AS3722 Pincontrol

This binding document doesn't appear to define any compatible value. I
think there should be one binding document per compatible value, rather
than splitting up the documentation of a particular binding into lots of
different files. As such, shouldn't this documentation all be part of
Documentation/devicetree/bindings/regulator/as3722-regulator.txt?

 +DT node contains the different subnode for pin control to represent

Which DT node? This paragraph doesn't make much sense...

 +different states. Each of these subnodes represents some desired
 +configuration for a list of pins. This configuration can include the
 +mux function to select on those pin(s), and various pin configuration
 +parameters, such as pull-up, open drain.

I think that paragraph is meant to say the following, as part of
as3722-regulator.txt:

==
Optional sub-nodes:

- pinmux: Represents the pinmux configuration of the AS3722 GPIO pins.
This node contains pin configuration nodes, as defined by
pinctrl-bindings.txt
==

(and then go on to describe the required/optional properties within the
pin configuration nodes)

 +Valid values for pin names are:

I would say pin property rather than pin names here, to make it
obvious where these values are used.

 + gpio0, gpio1, gpio2, gpio3, gpio4, gpio5, gpio6, gpio7
 +
 +Valid value of function names are:

Similarly, function property here. (and values for not value of).

Other than that, this binding looks reasonable.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 4/9] procfs: make /proc/*/{stack,syscall} 0400

2013-10-01 Thread Djalal Harouni
The /proc/*/{stack,syscall} contain sensitive information and currently
its mode is 0444. Change this to 0400 so the VFS will be able to block
unprivileged processes from getting file descriptors on arbitrary
privileged /proc/*/{stack,syscall} files.

This will also avoid doing extra unnecessary ptrace checks.

Cc: Eric W. Biederman ebied...@xmission.com
Acked-by: Kees Cook keesc...@chromium.org
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8d21316..54e926a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2682,7 +2682,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
REG(comm,  S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-   INF(syscall,S_IRUGO, proc_pid_syscall),
+   INF(syscall,S_IRUSR, proc_pid_syscall),
 #endif
INF(cmdline,S_IRUGO, proc_pid_cmdline),
ONE(stat,   S_IRUGO, proc_tgid_stat),
@@ -2710,7 +2710,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF(wchan,  S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-   ONE(stack,  S_IRUGO, proc_pid_stack),
+   ONE(stack,  S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
INF(schedstat,  S_IRUGO, proc_pid_schedstat),
@@ -3018,7 +3018,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
REG(comm,  S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-   INF(syscall,   S_IRUGO, proc_pid_syscall),
+   INF(syscall,   S_IRUSR, proc_pid_syscall),
 #endif
INF(cmdline,   S_IRUGO, proc_pid_cmdline),
ONE(stat,  S_IRUGO, proc_tid_stat),
@@ -3048,7 +3048,7 @@ static const struct pid_entry tid_base_stuff[] = {
INF(wchan, S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-   ONE(stack,  S_IRUGO, proc_pid_stack),
+   ONE(stack,  S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
INF(schedstat, S_IRUGO, proc_pid_schedstat),
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: fuse-3.12-fixes?

2013-10-01 Thread Sedat Dilek
On Mon, Sep 30, 2013 at 5:21 PM, Miklos Szeredi mik...@szeredi.hu wrote:
 On Mon, Sep 30, 2013 at 5:13 PM, Linus Torvalds
 torva...@linux-foundation.org wrote:
 On Mon, Sep 30, 2013 at 12:20 AM, Sedat Dilek sedat.di...@gmail.com wrote:

 any reasons why these fuse-3.12-fixes did not went into -rc3?

 Perhaps because they were never sent to me?

 Searching my email archives I see that a pull request went to Al and
 the mailing lists, but not actually to me. I do have a pending pull
 request from Al (which came in after -rc3) but that doesn't contain
 them either. And the for-linus branch implies that perhaps somebody
 _iindended_ to send them to me, but somehow missed.

 Mea culpa.  I sent it to Al, but didn't create a for_al branch for it,
 and Al didn't pull.  I'll send a new pull request after testing the
 fuse_dentry_revalidate fixes.


Hm, I see some updates in the for-linus GIT branch, but no for-al...

- Sedat -
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 5/9] procfs: make /proc entries that use seq files able to access file-f_cred

2013-10-01 Thread Djalal Harouni
In order to check if the cred of current have changed between -open()
and -read(), we must able to access the file's opener cred
'file-f_cred' at any point.

To make this possible for /proc/*/{stack,personality,stat} pass the
'file struct' as a 3rd argument to single_open() so it will be stored in
seq_file-private in seq_open(). This way these entries are able to
continue to use seq files, and access the file-f_cred easily.

This is also a preparation for the following patches which will check
the corresponding file-f_cred.

Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 54e926a..d4b604d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -765,7 +765,8 @@ static const struct file_operations 
proc_info_file_operations = {
 
 static int proc_single_show(struct seq_file *m, void *v)
 {
-   struct inode *inode = m-private;
+   struct file *filp = m-private;
+   struct inode *inode = file_inode(filp);
struct pid_namespace *ns;
struct pid *pid;
struct task_struct *task;
@@ -785,7 +786,9 @@ static int proc_single_show(struct seq_file *m, void *v)
 
 static int proc_single_open(struct inode *inode, struct file *filp)
 {
-   return single_open(filp, proc_single_show, inode);
+   /* Later we need inode and filp-f_cred, so pass filp as 3rd
+* argument, it will be referenced by seq_file-private */
+   return single_open(filp, proc_single_show, filp);
 }
 
 static const struct file_operations proc_single_file_operations = {
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 6/9] procfs: add permission checks on the file's opener of /proc/*/stat

2013-10-01 Thread Djalal Harouni
Some fields of the /proc/*/stat are sensitive fields that need
appropriate protection.

However, /proc file descriptors can be passed to a more privileged
process (e.g. a suid-exec) which will pass the classic
ptrace_may_access() permission check during read().

To prevent it, use proc_same_open_cred() to detect if current's cred
have changed between -open() and -read(), if so, call
proc_allow_access() to check if the original file's opener had enough
permissions to read these sensitive fields. This will prevent passing
file descriptors to a more privileged process to leak data.

The patch also adds a previously missing signal-cred_guard_mutex lock.

This patch does not break userspace since it only hides the fields that
were supposed to be protected.

Cc: Kees Cook keesc...@chromium.org
Cc: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/array.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b..f034e05 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -394,7 +394,7 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
char state;
pid_t ppid = 0, pgid = -1, sid = -1;
int num_threads = 0;
-   int permitted;
+   int permitted = 0;
struct mm_struct *mm;
unsigned long long start_time;
unsigned long cmin_flt = 0, cmaj_flt = 0;
@@ -404,10 +404,22 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
unsigned long rsslim = 0;
char tcomm[sizeof(task-comm)];
unsigned long flags;
+   struct file *file = m-private;
+   int same_cred = proc_same_open_cred(file-f_cred);
+   unsigned int ptrace_mode = PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT;
 
state = *get_task_state(task);
vsize = eip = esp = 0;
-   permitted = ptrace_may_access(task, PTRACE_MODE_READ | 
PTRACE_MODE_NOAUDIT);
+
+   if (!mutex_lock_killable(task-signal-cred_guard_mutex)) {
+   permitted = ptrace_may_access(task, ptrace_mode);
+   if (permitted  !same_cred)
+   permitted = proc_allow_access(file-f_cred,
+ task, ptrace_mode);
+
+   mutex_unlock(task-signal-cred_guard_mutex);
+   }
+
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 7/9] procfs: add permission checks on the file's opener of /proc/*/personality

2013-10-01 Thread Djalal Harouni
If current's cred have changed between -open() and -read(), then call
proc_allow_access() to check if the original file's opener had enough
permissions to access the /proc/*/personality entry during -read().

Cc: Kees Cook keesc...@chromium.org
Cc: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d4b604d..77f5b84 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2647,11 +2647,23 @@ static const struct file_operations 
proc_projid_map_operations = {
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
 {
+   struct file *file = m-private;
+   const struct cred *fcred = file-f_cred;
+   int same_cred = proc_same_open_cred(fcred);
int err = lock_trace(task);
-   if (!err) {
-   seq_printf(m, %08x\n, task-personality);
-   unlock_trace(task);
+   if (err)
+   return err;
+
+   if (!same_cred 
+   !proc_allow_access(fcred, task, PTRACE_MODE_ATTACH)) {
+   err = -EPERM;
+   goto out;
}
+
+   seq_printf(m, %08x\n, task-personality);
+
+out:
+   unlock_trace(task);
return err;
 }
 
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 8/9] procfs: improve permission checks on /proc/*/stack

2013-10-01 Thread Djalal Harouni
Permission checks need to happen during each system call. Therefore we
need to convert the /proc/*/stack entry from a ONE node to a REG node.
Doing this will make /proc/*/stack have its own file operations to
implement appropriate checks and avoid breaking shared ONE file
operations.

The patch makes sure that /proc/*/stack is still using seq files to
provide its output.

The patch adds stack_open() to check if the file's opener has enough
permission to ptrace the task during -open().

However, even with this, /proc file descriptors can be passed to a more
privileged process (e.g. a suid-exec) which will pass the classic
ptrace_may_access() permission check during read().

To prevent this, use proc_same_open_cred() to detect if the cred of
current have changed between -open() and -read(), if so, then call
proc_allow_access() to check if the original file's opener had enough
privileges to access the /proc's task entries during -read(). This will
block passing file descriptors to a more privileged process.

If the cred did not change then continue with read().

For readability, split code into another task_stack_show() function
which is used to get the stack trace of a task.

Cc: Kees Cook keesc...@chromium.org
Cc: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 87 ++
 1 file changed, 75 insertions(+), 12 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77f5b84..b80588a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -395,13 +395,14 @@ static void unlock_trace(struct task_struct *task)
 
 #define MAX_STACK_TRACE_DEPTH  64
 
-static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task)
+static int task_stack_show(struct seq_file *m, struct task_struct *task)
 {
-   struct stack_trace trace;
-   unsigned long *entries;
int err;
int i;
+   int same_cred;
+   struct stack_trace trace;
+   unsigned long *entries;
+   struct file *filp = m-private;
 
entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
if (!entries)
@@ -412,20 +413,82 @@ static int proc_pid_stack(struct seq_file *m, struct 
pid_namespace *ns,
trace.entries   = entries;
trace.skip  = 0;
 
+   same_cred = proc_same_open_cred(filp-f_cred);
+
err = lock_trace(task);
-   if (!err) {
-   save_stack_trace_tsk(task, trace);
+   if (err)
+   goto free;
 
-   for (i = 0; i  trace.nr_entries; i++) {
-   seq_printf(m, [%pK] %pS\n,
-  (void *)entries[i], (void *)entries[i]);
-   }
+   if (!same_cred 
+   !proc_allow_access(filp-f_cred, task, PTRACE_MODE_ATTACH)) {
+   err = -EPERM;
unlock_trace(task);
+   goto free;
+   }
+
+   save_stack_trace_tsk(task, trace);
+   unlock_trace(task);
+
+   for (i = 0; i  trace.nr_entries; i++) {
+   seq_printf(m, [%pK] %pS\n,
+  (void *)entries[i], (void *)entries[i]);
}
+
+free:
kfree(entries);
+   return err;
+}
 
+static int stack_show(struct seq_file *m, void *v)
+{
+   int ret;
+   struct pid *pid;
+   struct task_struct *task;
+   struct file *filp = m-private;
+   struct inode *inode = file_inode(filp);
+
+   ret = -ESRCH;
+   pid = proc_pid(inode);
+   task = get_pid_task(pid, PIDTYPE_PID);
+   if (!task)
+   return ret;
+
+   ret = task_stack_show(m, task);
+
+   put_task_struct(task);
+   return ret;
+}
+
+static int stack_open(struct inode *inode, struct file *filp)
+{
+   int err = -ESRCH;
+   struct task_struct *task = get_proc_task(file_inode(filp));
+
+   if (!task)
+   return err;
+
+   err = mutex_lock_killable(task-signal-cred_guard_mutex);
+   if (err)
+   goto out;
+
+   err = -EPERM;
+   if (ptrace_may_access(task, PTRACE_MODE_ATTACH))
+   /* We need inode and filp-f_cred, so pass filp
+* as third argument */
+   err = single_open(filp, stack_show, filp);
+
+   mutex_unlock(task-signal-cred_guard_mutex);
+out:
+   put_task_struct(task);
return err;
 }
+
+static const struct file_operations proc_pid_stack_operations = {
+   .open   = stack_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2725,7 +2788,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF(wchan,  S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-   ONE(stack,  S_IRUSR, proc_pid_stack),
+   REG(stack,  S_IRUSR, proc_pid_stack_operations),
 

[PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Helge Deller
print_worker_info() includes no validity check on the pwq and wq
pointers before handing them over to the probe_kernel_read() functions.

It seems that most architectures don't care about that, but at least on
the parisc architecture this leads to a kernel crash since accesses to
page zero are protected by the kernel for security reasons.

Fix this problem by verifying the contents of pwq and wq before usage.
Even if probe_kernel_read() usually prevents such crashes by disabling
page faults, clean code should always include such checks. 

Without this fix issuing echo t  /proc/sysrq-trigger will immediately
crash the Linux kernel on the parisc architecture.

CC: Tejun Heo t...@kernel.org
CC: Libin huawei.li...@huawei.com
CC: linux-par...@vger.kernel.org
CC: james.bottom...@hansenpartnership.com
Signed-off-by: Helge Deller del...@gmx.de

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d..c03b47f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4512,8 +4512,10 @@ void print_worker_info(const char *log_lvl, struct 
task_struct *task)
 */
probe_kernel_read(fn, worker-current_func, sizeof(fn));
probe_kernel_read(pwq, worker-current_pwq, sizeof(pwq));
-   probe_kernel_read(wq, pwq-wq, sizeof(wq));
-   probe_kernel_read(name, wq-name, sizeof(name) - 1);
+   if (pwq)
+   probe_kernel_read(wq, pwq-wq, sizeof(wq));
+   if (wq)
+   probe_kernel_read(name, wq-name, sizeof(name) - 1);
 
/* copy worker description */
probe_kernel_read(desc_valid, worker-desc_valid, sizeof(desc_valid));
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 9/9] procfs: improve permission checks on /proc/*/syscall

2013-10-01 Thread Djalal Harouni
Permission checks need to happen during each system call. Therefore we
need to convert the /proc/*/syscall entry from an INF node to a REG
node. Doing this will make /proc/*/syscall have its own file operations
to implement appropriate checks and avoid breaking shared INF file
operations.

Add the syscall_open() to check if the file's opener has enough
permission to ptrace the task during -open().

Add the syscall_read() to check permissions and to read task syscall
information. If the classic ptrace_may_access() check is passed, then
check if current's cred have changed between -open() and -read(), if
so, call proc_allow_access() to check if the original file's opener had
enough permissions to read the task syscall info. This will block
passing the file descriptor to a more privileged process.

For readability split code into another task_syscall_read() function
which is used to get the syscall entries of the task.

Cc: Kees Cook keesc...@chromium.org
Cc: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Djalal Harouni tix...@opendz.org
---
 fs/proc/base.c | 93 --
 1 file changed, 84 insertions(+), 9 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b80588a..f98889d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -150,6 +150,9 @@ struct pid_entry {
NULL, proc_single_file_operations, \
{ .proc_show = show } )
 
+/* 4K page size but our output routines use some slack for overruns */
+#define PROC_BLOCK_SIZE(3*1024)
+
 /**
  * proc_same_open_cred - Check if the file's opener cred matches the
  * current cred.
@@ -678,13 +681,32 @@ static int proc_pid_limits(struct task_struct *task, char 
*buffer)
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-static int proc_pid_syscall(struct task_struct *task, char *buffer)
+static int syscall_open(struct inode *inode, struct file *filp)
+{
+   int err = -ESRCH;
+   struct task_struct *task = get_proc_task(file_inode(filp));
+
+   if (!task)
+   return err;
+
+   err = mutex_lock_killable(task-signal-cred_guard_mutex);
+   if (err)
+   goto out;
+
+   if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
+   err = -EPERM;
+
+   mutex_unlock(task-signal-cred_guard_mutex);
+out:
+   put_task_struct(task);
+   return err;
+}
+
+static int task_syscall_read(struct task_struct *task, char *buffer)
 {
+   int res;
long nr;
unsigned long args[6], sp, pc;
-   int res = lock_trace(task);
-   if (res)
-   return res;
 
if (task_current_syscall(task, nr, args, 6, sp, pc))
res = sprintf(buffer, running\n);
@@ -696,9 +718,64 @@ static int proc_pid_syscall(struct task_struct *task, char 
*buffer)
   nr,
   args[0], args[1], args[2], args[3], args[4], args[5],
   sp, pc);
-   unlock_trace(task);
+
return res;
 }
+
+static ssize_t syscall_read(struct file *file, char __user *buf,
+   size_t count, loff_t *ppos)
+{
+   int same_cred;
+   ssize_t length;
+   unsigned long page;
+   const struct cred *fcred = file-f_cred;
+   struct inode *inode = file_inode(file);
+   struct task_struct *task = get_proc_task(inode);
+
+   length = -ESRCH;
+   if (!task)
+   return length;
+
+   if (count  PROC_BLOCK_SIZE)
+   count = PROC_BLOCK_SIZE;
+
+   length = -ENOMEM;
+   page = __get_free_page(GFP_TEMPORARY);
+   if (!page)
+   goto out;
+
+   same_cred = proc_same_open_cred(fcred);
+
+   length = lock_trace(task);
+   if (length)
+   goto out_free;
+
+   if (!same_cred 
+   !proc_allow_access(fcred, task, PTRACE_MODE_ATTACH)) {
+   length = -EPERM;
+   unlock_trace(task);
+   goto out_free;
+   }
+
+   length = task_syscall_read(task, (char *)page);
+   unlock_trace(task);
+
+   if (length = 0)
+   length = simple_read_from_buffer(buf, count, ppos,
+(char *)page, length);
+
+out_free:
+   free_page(page);
+out:
+   put_task_struct(task);
+   return length;
+}
+
+static const struct file_operations proc_pid_syscall_operations = {
+   .open   = syscall_open,
+   .read   = syscall_read,
+   .llseek = generic_file_llseek,
+};
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
 //
@@ -789,8 +866,6 @@ static const struct inode_operations 
proc_def_inode_operations = {
.setattr= proc_setattr,
 };
 
-#define PROC_BLOCK_SIZE(3*1024)/* 4K page size but our 
output routines use some slack for overruns */
-
 static ssize_t proc_info_read(struct file * file, char __user * buf,
  

[PATCH 4/4] ALSA: hda - hdmi: Disable ramp-up/down for non-PCM on AMD codecs

2013-10-01 Thread Anssi Hannula
Recent AMD HDMI codecs (revision ID 3 and later, 0x100300 as reported by
procfs codec#0) have a configurable ramp-up/down functionality.

The documentation ( http://www.x.org/docs/AMD/AMD_HDA_verbs_v2.pdf )
specifies that 180 (meaning 180/256 =~ 0.7) is recommended for PCM and 0
for non-PCM.

Apply the recommended values according to provided S/PDIF AES0 settings.

Signed-off-by: Anssi Hannula anssi.hann...@iki.fi
---
 sound/pci/hda/patch_hdmi.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index c0cd4ca..22f30fe 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -2673,6 +2673,10 @@ static int patch_nvhdmi_8ch_7x(struct hda_codec *codec)
 #define ATI_VERB_GET_MULTICHANNEL_70xf88
 #define ATI_VERB_GET_MULTICHANNEL_MODE 0xf89
 
+/* AMD specific HDA cvt verbs */
+#define ATI_VERB_SET_RAMP_RATE 0x770
+#define ATI_VERB_GET_RAMP_RATE 0xf70
+
 #define ATI_OUT_ENABLE 0x1
 
 #define ATI_HBR_CAPABLE 0x01
@@ -2824,6 +2828,7 @@ static int atihdmi_playback_pcm_prepare(struct 
hda_pcm_stream *hinfo,
unsigned int format,
struct snd_pcm_substream *substream)
 {
+   hda_nid_t cvt_nid = hinfo-nid;
struct hdmi_spec *spec = codec-spec;
int pin_idx = hinfo_to_pin_index(spec, hinfo);
struct hdmi_spec_per_pin *per_pin = get_pin(spec, pin_idx);
@@ -2853,6 +2858,15 @@ static int atihdmi_playback_pcm_prepare(struct 
hda_pcm_stream *hinfo,
return -EINVAL;
}
 
+   if (is_amdhdmi_rev3(codec)) {
+   int ramp_rate = 180; /* default as per spec */
+   /* disable ramp-up/down for non-pcm as per spec */
+   if (format  AC_FMT_TYPE_NON_PCM)
+   ramp_rate = 0;
+
+   snd_hda_codec_write(codec, cvt_nid, 0, ATI_VERB_SET_RAMP_RATE, 
ramp_rate);
+   }
+
return generic_hdmi_playback_pcm_prepare(hinfo, codec, stream_tag, 
format, substream);
 }
 
-- 
1.8.1.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 01/10] pwm-backlight: Refactor backlight power on/off

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 12:26:07PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:40 PM, Thierry Reding wrote:
  In preparation for adding an optional regulator and enable GPIO to the
  driver, split the power on and power off sequences into separate
  functions to reduce code duplication at the multiple call sites.
 
  diff --git a/drivers/video/backlight/pwm_bl.c 
  b/drivers/video/backlight/pwm_bl.c
 
  +static void pwm_backlight_power_off(struct pwm_bl_data *pb)
  +{
  +   pwm_disable(pb-pwm);
 
 Both the call-sites you're replacing do the following before pwm_disable():
 
   pwm_config(pb-pwm, 0, pb-period);
 
 While I agree that probably shouldn't be necessary, I think it's at
 least worth mentioning that in the commit description just to make it
 obvious that it was a deliberate change. Splitting that change into a
 separate patch might be reasonable in order to keep refactoring and
 functional changes separate, although perhaps it's not worth it.

Actually I'll add that back. I'm not sure exactly why I dropped it (it
may just have been oversight) and there have been reports that some HW
actually requires pwm_config(..., 0, ...) before pwm_disable() to turn
off properly.

 There are also a couple unrelated whitespace changes thrown in here.

I think they increase readability, but I can certainly move them into a
separate patch.

Thierry


pgpZyJ1PYEFoR.pgp
Description: PGP signature


[RFC/RFT v2 0/4] ALSA: hda - hdmi: ATI/AMD multi-channel and HBR support

2013-10-01 Thread Anssi Hannula
Hi all!

Here is a second revision of the ATI/AMD multichannel patch, now a
patchset.

Since the last revision from a bit over the week ago, AMD has released
more documentation ( http://www.x.org/docs/AMD/AMD_HDA_verbs_v2.pdf ),
so the patchset contains new additions:

- HBR bitstreaming support (HD passthrough, for DTS-HD and Dolby TrueHD)
  o This completes the set, now we have HBR on NVIDIA, Intel and AMD :)

- More complete ELD emulation on revision 3+ (0x100300) codecs
  o Untested, though from what I can see the newly added fields may not
be filled in yet by the radeon driver.

- Ramp up/down configuration according to non-pcm bit on rev3+
  o I guess this means gradual fade-in/out? Untested.

Other changes since the previous post:
- ELD emulation moved to hda_eld.c and cleaned up a bit
- Some fixes to hdmi_chmap_ctl_tlv() changes
(and maybe something else I forgot - if so, sorry)

Also, the pairwise channel mapping code has been tested to work with
custom channel maps. The TLV contents are shown as follows by amixer:
| chmap-paired=FL,FR
| chmap-paired=FL,FR,NA,LFE
| chmap-paired=FL,FR,FC,NA
| chmap-paired=FL,FR,RL,RR
| chmap-paired=FL,FR,FC,LFE
| chmap-paired=FL,FR,NA,LFE,RL,RR
| chmap-paired=FL,FR,FC,NA,RL,RR
[etc]

Rafał, do you know about the missing (AFAICS) support for lipsync (F7B)
and sink information (F81) in radeon driver?

This patchset requires at least more testing before it should be applied.

A few other somewhat open things:
- I named everything ati/ATI except for is_amdhdmi_rev3() and
  has_amd_full_remap_support() that are AMD-only, though AMD-only verbs
  I still named ATI to have similar names to other verbs. Does that
  seem sensible or how should this be done?
- Currently setup_audio_infoframe() writes the channel mapping every
  time for ATI/AMD, should we try to avoid that? If so, any
  suggestions?
  (argh, just noticed this is broken for manual channel maps on non-ATI;
   if just the chmap is changed, channel mapping is not updated;
   so this requires fixes in the generic side as well)

I'm especially interested in testers with:

- Older codecs other than 0x1002aa01. My best guess still is that the
  new code works on them as well.
  o On these I'd like to know if multichannel and the new formats
work, i.e. e.g.
speaker-test -D hdmi:CARD=Generic,DEV=0 -c8 -r192000 -F S32_LE

- Codec 0x1002aa01 revisions 0x100300 (Rev ID 3) or later (see
  /proc/asound/cardX/codec#Y). These are HD7750+ I think. Stuff to be
  tested on these:
  o The ramp up/down stuff, i.e. patch 4. Is there any difference seen
with these, in the beginning/end (i.e. fade-out/in):
speaker-test -D hdmi:CARD=Generic,DEV=0,AES0=0x04 -c2 -r48000
speaker-test -D hdmi:CARD=Generic,DEV=0,AES0=0x06 -c2 -r48000
Also, is there a difference in the beginning of these
(maybe garbage sound and/or slightly slower startup?):
aplay -Dhdmi:CARD=Generic,DEV=0,AES0=4 -r44100 -f s16_le -c2 
testi.dts.cut.spdif
aplay -Dhdmi:CARD=Generic,DEV=0,AES0=6 -r44100 -f s16_le -c2 
testi.dts.cut.spdif

  o Contents of /proc/asound/cardX/eld#0. I'd like to see the contents
both with radeon and with the proprietary fglrx driver in use

Olivier, can you test again on your rev3 hw? :)

The patchset can be found in combined form (for e.g. testing purposes) at:
http://onse.fi/files/atihdmi5.patch

The test file referenced above can be found at:
http://onse.fi/files/testi.dts.cut.spdif.gz (just regular DTS)

Anssi Hannula (4):
  ALSA: hda - hdmi: Add ATI/AMD multi-channel audio support
  ALSA: hda - hdmi: Add ELD emulation for ATI/AMD codecs
  ALSA: hda - hdmi: Add HBR bitstreaming support for ATI/AMD HDMI codecs
  ALSA: hda - hdmi: Disable ramp-up/down for non-PCM on AMD codecs

 sound/pci/hda/hda_eld.c| 157 
+
 sound/pci/hda/hda_local.h  |   5 +++
 sound/pci/hda/patch_hdmi.c | 424 
++--
 3 files changed, 547 insertions(+), 39 deletions(-)

-- 
Anssi Hannula

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: x86: sort reboot DMI quirks by vendor.

2013-10-01 Thread Dave Jones
Grouping them by vendor should make it easier to spot duplicates.

Signed-off-by: Dave Jones da...@fedoraproject.org

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d9333a4..7692520 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -136,236 +136,248 @@ static int __init set_kbd_reboot(const struct 
dmi_system_id *d)
  * This is a single dmi_table handling all reboot quirks.
  */
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
-   {   /* Handle problems with rebooting on Dell E520's */
-   .callback = set_bios_reboot,
-   .ident = Dell E520,
+
+   /* Acer */
+   {   /* Handle reboot issue on Acer Aspire one */
+   .callback = set_kbd_reboot,
+   .ident = Acer Aspire One A110,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Inc.),
-   DMI_MATCH(DMI_PRODUCT_NAME, Dell DM061),
+   DMI_MATCH(DMI_SYS_VENDOR, Acer),
+   DMI_MATCH(DMI_PRODUCT_NAME, AOA110),
},
},
-   {   /* Handle problems with rebooting on Dell 1300's */
-   .callback = set_bios_reboot,
-   .ident = Dell PowerEdge 1300,
+
+   /* Apple */
+   {   /* Handle problems with rebooting on Apple MacBook5 */
+   .callback = set_pci_reboot,
+   .ident = Apple MacBook5,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Computer Corporation),
-   DMI_MATCH(DMI_PRODUCT_NAME, PowerEdge 1300/),
+   DMI_MATCH(DMI_SYS_VENDOR, Apple Inc.),
+   DMI_MATCH(DMI_PRODUCT_NAME, MacBook5),
},
},
-   {   /* Handle problems with rebooting on Dell 300's */
-   .callback = set_bios_reboot,
-   .ident = Dell PowerEdge 300,
+   {   /* Handle problems with rebooting on Apple MacBookPro5 */
+   .callback = set_pci_reboot,
+   .ident = Apple MacBookPro5,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Computer Corporation),
-   DMI_MATCH(DMI_PRODUCT_NAME, PowerEdge 300/),
+   DMI_MATCH(DMI_SYS_VENDOR, Apple Inc.),
+   DMI_MATCH(DMI_PRODUCT_NAME, MacBookPro5),
},
},
-   {   /* Handle problems with rebooting on Dell Optiplex 745's SFF */
-   .callback = set_bios_reboot,
-   .ident = Dell OptiPlex 745,
+   {   /* Handle problems with rebooting on Apple Macmini3,1 */
+   .callback = set_pci_reboot,
+   .ident = Apple Macmini3,1,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Inc.),
-   DMI_MATCH(DMI_PRODUCT_NAME, OptiPlex 745),
+   DMI_MATCH(DMI_SYS_VENDOR, Apple Inc.),
+   DMI_MATCH(DMI_PRODUCT_NAME, Macmini3,1),
},
},
-   {   /* Handle problems with rebooting on Dell Optiplex 745's DFF */
-   .callback = set_bios_reboot,
-   .ident = Dell OptiPlex 745,
+   {   /* Handle problems with rebooting on the iMac9,1. */
+   .callback = set_pci_reboot,
+   .ident = Apple iMac9,1,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Inc.),
-   DMI_MATCH(DMI_PRODUCT_NAME, OptiPlex 745),
-   DMI_MATCH(DMI_BOARD_NAME, 0MM599),
+   DMI_MATCH(DMI_SYS_VENDOR, Apple Inc.),
+   DMI_MATCH(DMI_PRODUCT_NAME, iMac9,1),
},
},
-   {   /* Handle problems with rebooting on Dell Optiplex 745 with 
0KW626 */
+
+   /* ASUS */
+   {   /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
-   .ident = Dell OptiPlex 745,
+   .ident = ASUS P4S800,
.matches = {
-   DMI_MATCH(DMI_SYS_VENDOR, Dell Inc.),
-   DMI_MATCH(DMI_PRODUCT_NAME, OptiPlex 745),
-   DMI_MATCH(DMI_BOARD_NAME, 0KW626),
+   DMI_MATCH(DMI_BOARD_VENDOR, ASUSTeK Computer INC.),
+   DMI_MATCH(DMI_BOARD_NAME, P4S800),
},
},
-   {   /* Handle problems with rebooting on Dell Optiplex 330 with 
0KP561 */
+
+   /* Dell */
+   {   /* Handle problems with rebooting on Dell DXP061 */
.callback = set_bios_reboot,
-   .ident = Dell OptiPlex 330,
+   .ident = Dell DXP061,
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, Dell Inc.),
-   DMI_MATCH(DMI_PRODUCT_NAME, OptiPlex 330),
-   DMI_MATCH(DMI_BOARD_NAME, 0KP561),
+ 

[PATCH 1/4] ALSA: hda - hdmi: Add ATI/AMD multi-channel audio support

2013-10-01 Thread Anssi Hannula
ATI/AMD codecs do not support all the standard HDA HDMI/DP functions,
instead various vendor-specific verbs are provided.

This commit addresses these missing functions:
- standard channel mapping support
- standard infoframe configuration support

ATI/AMD provides their own verbs that allow the following:
- setting CA for infoframe
- setting down-mix information for infoframe
- channel pair remapping
- individual channel remapping (revision ID 3+, 0x100300+)

The documentation for the verbs has now been released by AMD:
http://www.x.org/docs/AMD/AMD_HDA_verbs_v2.pdf

Add support for the ATI/AMD specific verbs and use them instead of the
generic methods on ATI/AMD codecs. This allows multi-channel PCM audio
to work.

Channel remapping is restricted to pairwise mapping on codecs with
revision ID 2 (0x100200 as reported by procfs codec#X) or lower. This
means cards up to Radeon HD7670 as far as I know. This will not affect
standard multi-channel modes since these codecs support automatic
FC-LFE swapping for HDMI.

ATI/AMD codecs do not advertise all of their supported rates, formats
and channel counts, therefore that information is forced accordingly so
that all HDMI 1.x PCM parameters are marked as supported.

Support for multiple ports is also added to patch_atihdmi so that
0x1002aa01 codecs with multiple ports will work properly when switched
back to that patch.

Signed-off-by: Anssi Hannula anssi.hann...@iki.fi
Tested-by: Peter Frühberger frit...@xbmc.org
---
 sound/pci/hda/hda_local.h  |   5 +
 sound/pci/hda/patch_hdmi.c | 355 +++--
 2 files changed, 317 insertions(+), 43 deletions(-)

diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h
index 2e7493e..7c0b89e 100644
--- a/sound/pci/hda/hda_local.h
+++ b/sound/pci/hda/hda_local.h
@@ -786,4 +786,9 @@ static inline void snd_hda_eld_proc_free(struct hda_codec 
*codec,
 #define SND_PRINT_CHANNEL_ALLOCATION_ADVISED_BUFSIZE 80
 void snd_print_channel_allocation(int spk_alloc, char *buf, int buflen);
 
+/* shared with patch_hdmi.c and hda_eld.c */
+#define is_atihdmi(codec) (((codec)-vendor_id  0x) == 0x1002)
+#define is_amdhdmi_rev3(codec) \
+   ((codec)-vendor_id == 0x1002791a  ((codec)-revision_id  0xff00) = 
0x0300)
+
 #endif /* __SOUND_HDA_LOCAL_H */
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 3d8cd044..19adb01 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -6,6 +6,7 @@
  *  Copyright (c) 2006 ATI Technologies Inc.
  *  Copyright (c) 2008 NVIDIA Corp.  All rights reserved.
  *  Copyright (c) 2008 Wei Ni w...@nvidia.com
+ *  Copyright (c) 2013 Anssi Hannula anssi.hann...@iki.fi
  *
  *  Authors:
  * Wu Fengguang w...@linux.intel.com
@@ -46,6 +47,9 @@ MODULE_PARM_DESC(static_hdmi_pcm, Don't restrict PCM 
parameters per ELD info);
 
 #define is_haswell(codec)  ((codec)-vendor_id == 0x80862807)
 
+/* is_atihdmi() and is_amdhdmi_rev3() are in hda_local.h */
+#define has_amd_full_remap_support(codec) is_amdhdmi_rev3(codec)
+
 struct hdmi_spec_per_cvt {
hda_nid_t cvt_nid;
int assigned;
@@ -89,7 +93,7 @@ struct hdmi_spec {
 
struct hdmi_eld temp_eld;
/*
-* Non-generic ATI/NVIDIA specific
+* Non-generic VIA/NVIDIA specific
 */
struct hda_multi_out multiout;
struct hda_pcm_stream pcm_playback;
@@ -573,6 +577,20 @@ static int hdmi_channel_allocation(struct hdmi_eld *eld, 
int channels)
return ca;
 }
 
+#ifdef CONFIG_SND_DEBUG_VERBOSE
+static int atihdmi_get_chan_slot(struct hda_codec *codec, hda_nid_t pin_nid, 
int asp_slot);
+
+static int hdmi_get_chan_slot(struct hda_codec *codec, hda_nid_t pin_nid, int 
asp_slot)
+{
+   if (is_atihdmi(codec))
+   return atihdmi_get_chan_slot(codec, pin_nid, asp_slot);
+
+   return snd_hda_codec_read(codec, pin_nid, 0,
+ AC_VERB_GET_HDMI_CHAN_SLOT,
+ asp_slot);
+}
+#endif
+
 static void hdmi_debug_channel_mapping(struct hda_codec *codec,
   hda_nid_t pin_nid)
 {
@@ -581,14 +599,26 @@ static void hdmi_debug_channel_mapping(struct hda_codec 
*codec,
int slot;
 
for (i = 0; i  8; i++) {
-   slot = snd_hda_codec_read(codec, pin_nid, 0,
-   AC_VERB_GET_HDMI_CHAN_SLOT, i);
+   slot = hdmi_get_chan_slot(codec, pin_nid, i);
printk(KERN_DEBUG HDMI: ASP channel %d = slot %d\n,
slot  4, slot  0xf);
}
 #endif
 }
 
+static int atihdmi_set_chan_slot(struct hda_codec *codec, hda_nid_t pin_nid,
+ int chanslot_setup);
+
+static int hdmi_set_chan_slot(struct hda_codec *codec, hda_nid_t pin_nid,
+  int chanslot_setup)
+{
+   if (is_atihdmi(codec))
+   return 

Re: [PATCH V3 1/3] mfd: add support for AMS AS3722 PMIC

2013-10-01 Thread Stephen Warren
On 09/24/2013 05:58 AM, Laxman Dewangan wrote:
 The AMS AS3722 is a compact system PMU suitable for mobile phones,
 tablets etc. It has 4 DC/DC step-down regulators, 3 DC/DC step-down
 controller, 11 LDOs, RTC, automatic battery, temperature and
 over-current monitoring, 8 GPIOs, ADC and watchdog.
 
 Add MFD core driver for the AS3722 to support core functionality.

 diff --git a/Documentation/devicetree/bindings/mfd/as3722.txt 
 b/Documentation/devicetree/bindings/mfd/as3722.txt

 +Required properties:
 +---
 +- compatible : Must be ams,as3722.
 +- reg:  I2C device address.
 +- interrupt-controller : AS3722 has its own internal IRQs

You should at least specify that the property is a Boolean.

That description a justification for why that property should be
present, not a description of the property.

If the device only has *internal* IRQs, you shouldn't need any of the
IRQ properties in DT. Rather, I believe you need the IRQ properties
because the device has *external* IRQ inputs via the gpio-in-interrupt
GPIO mux function.

 +- interrupt-parent : The parent interrupt controller.

While that property is allowed, it's not required. It's also unusual to
mention it in individual bindings, since it's a system-defined property.

 +Optional properties:
 +---

 +- pinctrl-names: It is define in the
 + Documentation/devicetree/bindings/pinctrl/pinctrl-binding.txt

You need to specify which values must be present in this property.

Why is the property optional?

If you're going to mention pinctrl-names, shouldn't you also mention
pinctrl-$n?

Instead, I would simply say:

==
Optional properties:

A pinctrl state named default must be defined, using the bindings in
../pinctrl/pinctrl-binding.txt.
==

Oh, and don't use an absolute file-name within the Linux kernel source
tree, since that path name will change after the binding docs are moved
out of the kernel source tree.

As I mentioned when reviewing the pinmux binding, that binding should be
described here.

In the example, I see a regulators binding. That should be here too.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] ALSA: hda - hdmi: Add ELD emulation for ATI/AMD codecs

2013-10-01 Thread Anssi Hannula
ATI/AMD HDMI/DP codecs do not include standard HDA ELD (EDID-like data)
support.

In place of providing access to an ELD buffer, various vendor-specific
verbs are provided to provide the relevant information. Revision ID 3
and later (0x100300 as reported by procfs codec#X) have support for
providing more information than the previous revisions (but only if
supported by the display driver).

Generate ELD from the information provided by the vendor-specific verbs
on ATI/AMD codecs.

The specification is available at:
http://www.x.org/docs/AMD/AMD_HDA_verbs_v2.pdf

Signed-off-by: Anssi Hannula anssi.hann...@iki.fi
Tested-by: Peter Frühberger frit...@xbmc.org
---
 sound/pci/hda/hda_eld.c | 157 
 1 file changed, 157 insertions(+)

diff --git a/sound/pci/hda/hda_eld.c b/sound/pci/hda/hda_eld.c
index d0d7ac1e..750841e 100644
--- a/sound/pci/hda/hda_eld.c
+++ b/sound/pci/hda/hda_eld.c
@@ -2,6 +2,7 @@
  * Generic routines and proc interface for ELD(EDID Like Data) information
  *
  * Copyright(c) 2008 Intel Corporation.
+ * Copyright (c) 2013 Anssi Hannula anssi.hann...@iki.fi
  *
  * Authors:
  * Wu Fengguang w...@linux.intel.com
@@ -316,6 +317,9 @@ int snd_hdmi_get_eld_size(struct hda_codec *codec, 
hda_nid_t nid)
 AC_DIPSIZE_ELD_BUF);
 }
 
+static int atihdmi_get_eld(struct hda_codec *codec, hda_nid_t nid,
+  unsigned char *buf, int *eld_size);
+
 int snd_hdmi_get_eld(struct hda_codec *codec, hda_nid_t nid,
 unsigned char *buf, int *eld_size)
 {
@@ -323,6 +327,9 @@ int snd_hdmi_get_eld(struct hda_codec *codec, hda_nid_t nid,
int ret = 0;
int size;
 
+   if (is_atihdmi(codec))
+   return atihdmi_get_eld(codec, nid, buf, eld_size);
+
/*
 * ELD size is initialized to zero in caller function. If no errors and
 * ELD is valid, actual eld_size is assigned.
@@ -671,3 +678,153 @@ void snd_hdmi_eld_update_pcm_info(struct parsed_hdmi_eld 
*e,
hinfo-maxbps = min(hinfo-maxbps, maxbps);
hinfo-channels_max = min(hinfo-channels_max, channels_max);
 }
+
+
+/* ATI/AMD specific stuff (ELD emulation) */
+
+#define ATI_VERB_SET_AUDIO_DESCRIPTOR  0x776
+#define ATI_VERB_SET_SINK_INFO_INDEX   0x780
+#define ATI_VERB_GET_SPEAKER_ALLOCATION0xf70
+#define ATI_VERB_GET_AUDIO_DESCRIPTOR  0xf76
+#define ATI_VERB_GET_AUDIO_VIDEO_DELAY 0xf7b
+#define ATI_VERB_GET_SINK_INFO_INDEX   0xf80
+#define ATI_VERB_GET_SINK_INFO_DATA0xf81
+
+#define ATI_SPKALLOC_SPKALLOC  0x007f
+#define ATI_SPKALLOC_TYPE_HDMI 0x0100
+#define ATI_SPKALLOC_TYPE_DISPLAYPORT  0x0200
+
+/* first three bytes are just standard SAD */
+#define ATI_AUDIODESC_CHANNELS 0x0007
+#define ATI_AUDIODESC_RATES0xff00
+#define ATI_AUDIODESC_LPCM_STEREO_RATES0xff00
+
+/* in standard HDMI VSDB format */
+#define ATI_DELAY_VIDEO_LATENCY0x00ff
+#define ATI_DELAY_AUDIO_LATENCY0xff00
+
+enum ati_sink_info_idx {
+   ATI_INFO_IDX_MANUFACTURER_ID= 0,
+   ATI_INFO_IDX_PRODUCT_ID = 1,
+   ATI_INFO_IDX_SINK_DESC_LEN  = 2,
+   ATI_INFO_IDX_PORT_ID_LOW= 3,
+   ATI_INFO_IDX_PORT_ID_HIGH   = 4,
+   ATI_INFO_IDX_SINK_DESC_FIRST= 5,
+   ATI_INFO_IDX_SINK_DESC_LAST = 22, /* max len 18 bytes */
+};
+
+static int atihdmi_get_eld(struct hda_codec *codec, hda_nid_t nid,
+unsigned char *buf, int *eld_size)
+{
+   int spkalloc, ati_sad, aud_synch;
+   int sink_desc_len = 0;
+   int pos, i;
+
+   /* ATI/AMD does not have ELD, emulate it */
+
+   spkalloc = snd_hda_codec_read(codec, nid, 0, 
ATI_VERB_GET_SPEAKER_ALLOCATION, 0);
+
+   if (!spkalloc) {
+   snd_printd(KERN_INFO HDMI ATI/AMD: no speaker allocation for 
ELD\n);
+   return -EINVAL;
+   }
+
+   memset(buf, 0, ELD_FIXED_BYTES + ELD_MAX_MNL + ELD_MAX_SAD * 3);
+
+   /* version */
+   buf[0] = ELD_VER_CEA_861D  3;
+
+   /* speaker allocation from EDID */
+   buf[7] = spkalloc  ATI_SPKALLOC_SPKALLOC;
+
+   /* is DisplayPort? */
+   if (spkalloc  ATI_SPKALLOC_TYPE_DISPLAYPORT)
+   buf[5] |= 0x04;
+
+   pos = ELD_FIXED_BYTES;
+
+   if (is_amdhdmi_rev3(codec)) {
+   int sink_info;
+
+   snd_hda_codec_write(codec, nid, 0, 
ATI_VERB_SET_SINK_INFO_INDEX, ATI_INFO_IDX_PORT_ID_LOW);
+   sink_info = snd_hda_codec_read(codec, nid, 0, 
ATI_VERB_GET_SINK_INFO_DATA, 0);
+   put_unaligned_le32(sink_info, buf + 8);
+
+   snd_hda_codec_write(codec, nid, 0, 
ATI_VERB_SET_SINK_INFO_INDEX, ATI_INFO_IDX_PORT_ID_HIGH);
+   sink_info = snd_hda_codec_read(codec, nid, 0, 
ATI_VERB_GET_SINK_INFO_DATA, 0);
+   put_unaligned_le32(sink_info, buf + 12);
+
+   

Re: [PATCH] hotplug: Optimize {get,put}_online_cpus()

2013-10-01 Thread Paul E. McKenney
On Thu, Sep 26, 2013 at 01:10:42PM +0200, Peter Zijlstra wrote:
 On Wed, Sep 25, 2013 at 02:22:00PM -0700, Paul E. McKenney wrote:
  A couple of nits and some commentary, but if there are races, they are
  quite subtle.  ;-)
 
 *whee*..
 
 I made one little change in the logic; I moved the waitcount increment
 to before the __put_online_cpus() call, such that the writer will have
 to wait for us to wake up before trying again -- not for us to actually
 have acquired the read lock, for that we'd need to mess up
 __get_online_cpus() a bit more.
 
 Complete patch below.

OK, looks like Oleg is correct, the cpuhp_seq can be dispensed with.

I still don't see anything wrong with it, so time for a serious stress
test on a large system.  ;-)

Additional commentary interspersed.

Thanx, Paul

 ---
 Subject: hotplug: Optimize {get,put}_online_cpus()
 From: Peter Zijlstra pet...@infradead.org
 Date: Tue Sep 17 16:17:11 CEST 2013
 
 The current implementation of get_online_cpus() is global of nature
 and thus not suited for any kind of common usage.
 
 Re-implement the current recursive r/w cpu hotplug lock such that the
 read side locks are as light as possible.
 
 The current cpu hotplug lock is entirely reader biased; but since
 readers are expensive there aren't a lot of them about and writer
 starvation isn't a particular problem.
 
 However by making the reader side more usable there is a fair chance
 it will get used more and thus the starvation issue becomes a real
 possibility.
 
 Therefore this new implementation is fair, alternating readers and
 writers; this however requires per-task state to allow the reader
 recursion.
 
 Many comments are contributed by Paul McKenney, and many previous
 attempts were shown to be inadequate by both Paul and Oleg; many
 thanks to them for persisting to poke holes in my attempts.
 
 Cc: Oleg Nesterov o...@redhat.com
 Cc: Paul McKenney paul...@linux.vnet.ibm.com
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: Steven Rostedt rost...@goodmis.org
 Signed-off-by: Peter Zijlstra pet...@infradead.org
 ---
  include/linux/cpu.h   |   58 +
  include/linux/sched.h |3 
  kernel/cpu.c  |  209 
 +++---
  kernel/sched/core.c   |2 
  4 files changed, 208 insertions(+), 64 deletions(-)

I stripped the removed lines to keep my eyes from going buggy.

 --- a/include/linux/cpu.h
 +++ b/include/linux/cpu.h
 @@ -16,6 +16,7 @@
  #include linux/node.h
  #include linux/compiler.h
  #include linux/cpumask.h
 +#include linux/percpu.h
 
  struct device;
 
 @@ -173,10 +174,61 @@ extern struct bus_type cpu_subsys;
  #ifdef CONFIG_HOTPLUG_CPU
  /* Stop CPUs going up and down. */
 
 +extern void cpu_hotplug_init_task(struct task_struct *p);
 +
  extern void cpu_hotplug_begin(void);
  extern void cpu_hotplug_done(void);
 +
 +extern int __cpuhp_state;
 +DECLARE_PER_CPU(unsigned int, __cpuhp_refcount);
 +
 +extern void __get_online_cpus(void);
 +
 +static inline void get_online_cpus(void)
 +{
 + might_sleep();
 +
 + /* Support reader recursion */
 + /* The value was = 1 and remains so, reordering causes no harm. */
 + if (current-cpuhp_ref++)
 + return;
 +
 + preempt_disable();
 + if (likely(!__cpuhp_state)) {
 + /* The barrier here is supplied by synchronize_sched(). */

I guess I shouldn't complain about the comment given where it came
from, but...

A more accurate comment would say that we are in an RCU-sched read-side
critical section, so the writer cannot both change __cpuhp_state from
readers_fast and start checking counters while we are here.  So if we see
!__cpuhp_state, we know that the writer won't be checking until we past
the preempt_enable() and that once the synchronize_sched() is done,
the writer will see anything we did within this RCU-sched read-side
critical section.

(The writer -can- change __cpuhp_state from readers_slow to readers_block
while we are in this read-side critical section and then start summing
counters, but that corresponds to a different if statement.)

 + __this_cpu_inc(__cpuhp_refcount);
 + } else {
 + __get_online_cpus(); /* Unconditional memory barrier. */
 + }
 + preempt_enable();
 + /*
 +  * The barrier() from preempt_enable() prevents the compiler from
 +  * bleeding the critical section out.
 +  */
 +}
 +
 +extern void __put_online_cpus(void);
 +
 +static inline void put_online_cpus(void)
 +{
 + /* The value was = 1 and remains so, reordering causes no harm. */
 + if (--current-cpuhp_ref)
 + return;
 +
 + /*
 +  * The barrier() in preempt_disable() prevents the compiler from
 +  * bleeding the critical section out.
 +  */
 + preempt_disable();
 + if (likely(!__cpuhp_state)) {
 + /* The barrier here is supplied by synchronize_sched().  */

Same here, both for the implied self-criticism 

[PATCH 3/4] ALSA: hda - hdmi: Add HBR bitstreaming support for ATI/AMD HDMI codecs

2013-10-01 Thread Anssi Hannula
ATI/AMD HDMI codecs do not include standard HDA HDMI HBR support (which
is required for bitstreaming DTS-HD and Dolby TrueHD), instead they have
custom verbs for checking and enabling it.

Add support for the ATI/AMD HDMI HBR verbs.

The specification is available at:
http://www.x.org/docs/AMD/AMD_HDA_verbs_v2.pdf

Signed-off-by: Anssi Hannula anssi.hann...@iki.fi
Tested-by: Peter Frühberger frit...@xbmc.org
---
 sound/pci/hda/patch_hdmi.c | 65 +-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 19adb01..c0cd4ca 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -1140,7 +1140,7 @@ static int hdmi_setup_stream(struct hda_codec *codec, 
hda_nid_t cvt_nid,
new_pinctl);
 
}
-   if (is_hbr_format(format)  !new_pinctl) {
+   if (is_hbr_format(format)  !new_pinctl  !is_atihdmi(codec)) {
snd_printdd(hdmi_setup_stream: HBR is not supported\n);
return -EINVAL;
}
@@ -2654,6 +2654,7 @@ static int patch_nvhdmi_8ch_7x(struct hda_codec *codec)
 #define ATI_VERB_SET_MULTICHANNEL_23   0x778
 #define ATI_VERB_SET_MULTICHANNEL_45   0x779
 #define ATI_VERB_SET_MULTICHANNEL_67   0x77a
+#define ATI_VERB_SET_HBR_CONTROL   0x77c
 #define ATI_VERB_SET_MULTICHANNEL_10x785
 #define ATI_VERB_SET_MULTICHANNEL_30x786
 #define ATI_VERB_SET_MULTICHANNEL_50x787
@@ -2665,6 +2666,7 @@ static int patch_nvhdmi_8ch_7x(struct hda_codec *codec)
 #define ATI_VERB_GET_MULTICHANNEL_23   0xf78
 #define ATI_VERB_GET_MULTICHANNEL_45   0xf79
 #define ATI_VERB_GET_MULTICHANNEL_67   0xf7a
+#define ATI_VERB_GET_HBR_CONTROL   0xf7c
 #define ATI_VERB_GET_MULTICHANNEL_10xf85
 #define ATI_VERB_GET_MULTICHANNEL_30xf86
 #define ATI_VERB_GET_MULTICHANNEL_50xf87
@@ -2673,6 +2675,9 @@ static int patch_nvhdmi_8ch_7x(struct hda_codec *codec)
 
 #define ATI_OUT_ENABLE 0x1
 
+#define ATI_HBR_CAPABLE 0x01
+#define ATI_HBR_ENABLE 0x10
+
 static void atihdmi_set_ca(struct hda_codec *codec, hda_nid_t pin_nid, int ca)
 {
printk(ATI: setting ca %d\n, ca);
@@ -2813,6 +2818,63 @@ static int atihdmi_get_chan_slot(struct hda_codec 
*codec, hda_nid_t pin_nid, int
 }
 #endif
 
+static int atihdmi_playback_pcm_prepare(struct hda_pcm_stream *hinfo,
+   struct hda_codec *codec,
+   unsigned int stream_tag,
+   unsigned int format,
+   struct snd_pcm_substream *substream)
+{
+   struct hdmi_spec *spec = codec-spec;
+   int pin_idx = hinfo_to_pin_index(spec, hinfo);
+   struct hdmi_spec_per_pin *per_pin = get_pin(spec, pin_idx);
+   hda_nid_t pin_nid = per_pin-pin_nid;
+   int hbr_ctl, hbr_ctl_new;
+
+   hbr_ctl = snd_hda_codec_read(codec, pin_nid, 0, 
ATI_VERB_GET_HBR_CONTROL, 0);
+   if (hbr_ctl  ATI_HBR_CAPABLE) {
+   if (is_hbr_format(format))
+   hbr_ctl_new = hbr_ctl | ATI_HBR_ENABLE;
+   else
+   hbr_ctl_new = hbr_ctl  ~ATI_HBR_ENABLE;
+
+   snd_printdd(atihdmi_playback_pcm_prepare: 
+   NID=0x%x, %shbr-ctl=0x%x\n,
+   pin_nid,
+   hbr_ctl == hbr_ctl_new ?  : new-,
+   hbr_ctl_new);
+
+   if (hbr_ctl != hbr_ctl_new)
+   snd_hda_codec_write(codec, pin_nid, 0,
+   ATI_VERB_SET_HBR_CONTROL,
+   hbr_ctl_new);
+
+   } else if (is_hbr_format(format)) {
+   snd_printdd(atihdmi_playback_pcm_prepare: HBR is not 
supported\n);
+   return -EINVAL;
+   }
+
+   return generic_hdmi_playback_pcm_prepare(hinfo, codec, stream_tag, 
format, substream);
+}
+
+static int atihdmi_build_pcms(struct hda_codec *codec)
+{
+   struct hdmi_spec *spec = codec-spec;
+   int err, pin_idx;
+
+   err = generic_hdmi_build_pcms(codec);
+
+   if (err)
+   return err;
+
+   for (pin_idx = 0; pin_idx  spec-num_pins; pin_idx++) {
+   struct hda_pcm *info = get_pcm_rec(spec, pin_idx);
+
+   info-stream[SNDRV_PCM_STREAM_PLAYBACK].ops.prepare = 
atihdmi_playback_pcm_prepare;
+   }
+
+   return 0;
+}
+
 static int atihdmi_init(struct hda_codec *codec)
 {
struct hdmi_spec *spec = codec-spec;
@@ -2849,6 +2911,7 @@ static int patch_atihdmi(struct hda_codec *codec)
return err;
 
codec-patch_ops.init = atihdmi_init;
+   codec-patch_ops.build_pcms = atihdmi_build_pcms;
 
/* ATI/AMD converters do not advertise all of their capabilities */
spec = codec-spec;
-- 
1.8.1.5

--
To unsubscribe from this list: send the line 

Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Tejun Heo
Hello,

On Tue, Oct 01, 2013 at 10:35:20PM +0200, Helge Deller wrote:
 print_worker_info() includes no validity check on the pwq and wq
 pointers before handing them over to the probe_kernel_read() functions.
 
 It seems that most architectures don't care about that, but at least on
 the parisc architecture this leads to a kernel crash since accesses to
 page zero are protected by the kernel for security reasons.
 
 Fix this problem by verifying the contents of pwq and wq before usage.
 Even if probe_kernel_read() usually prevents such crashes by disabling
 page faults, clean code should always include such checks. 
 
 Without this fix issuing echo t  /proc/sysrq-trigger will immediately
 crash the Linux kernel on the parisc architecture.

Hmm... um had similar problem but the root cause here is that the arch
isn't implementing probe_kernel_read() properly.  We really have no
idea what the pointer value may be at the dump point and that's why we
use probe_kernel_read().  If something like the above is necessary for
the time being, the correct place would be the arch
probe_kernel_read() implementation.  James, would it be difficult
implement proper probe_kernel_read() on parisc?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 05/10] ARM: SAMSUNG: Initialize PWM backlight enable_gpio field

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 12:31:04PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
  The GPIO API defines 0 as being a valid GPIO number, so this field needs
  to be initialized explicitly.
 
   static void __init smdkv210_map_io(void)
 
  @@ -70,6 +70,7 @@ static struct samsung_bl_drvdata samsung_dfl_bl_data 
  __initdata = {
  .max_brightness = 255,
  .dft_brightness = 255,
  .pwm_period_ns  = 78770,
  +   .enable_gpio= -1,
  .init   = samsung_bl_init,
  .exit   = samsung_bl_exit,
  },
  @@ -121,6 +122,10 @@ void __init samsung_bl_set(struct samsung_bl_gpio_info 
  *gpio_info,
  samsung_bl_data-lth_brightness = bl_data-lth_brightness;
  if (bl_data-pwm_period_ns)
  samsung_bl_data-pwm_period_ns = bl_data-pwm_period_ns;
  +   if (bl_data-enable_gpio)
  +   samsung_bl_data-enable_gpio = bl_data-enable_gpio;
  +   if (bl_data-enable_gpio_flags)
  +   samsung_bl_data-enable_gpio_flags = bl_data-enable_gpio_flags;
 
 Won't this cause the core pwm_bl driver to request/manipulate the GPIO,
 whereas this driver already does that inside the samsung_bl_init/exit
 callbacks? I think you either need to adjust those callbacks, or not set
 the new standard GPIO property in samsung_bl_data.

I don't think so. The samsung_bl_data is a copy of samsung_dfl_bl_data
augmented by board-specific settings. So in fact copying these values
here is essential to allow boards to override the enable_gpio and flags
fields. Currently no board sets the enable_gpio to a valid GPIO so it's
all still handled by the callbacks only.

Thierry


pgplAElP1sXZJ.pgp
Description: PGP signature


Re: [PATCH 1/4] i2c: busses: i2c-st: Add ST I2C controller

2013-10-01 Thread Stephen Warren
On 10/01/2013 04:39 AM, Maxime COQUELIN wrote:
 This patch adds support to SSC (Synchronous Serial Controller)
 I2C driver. This IP also supports SPI protocol, but this is not
 the aim of this driver.
 
 This IP is embedded in all ST SoCs for Set-top box platorms, and
 supports I2C Standard and Fast modes.

 diff --git a/Documentation/devicetree/bindings/i2c/i2c-st.txt 
 b/Documentation/devicetree/bindings/i2c/i2c-st.txt

 +Required properties :

 +- clocks : phandle to the I2C clock source
 +- clock-names : from common clock binding: Shall be ssc

I'd prefer to define that as:

clock-names: Must contain ssc.
clocks: Must contain an entry for each name in clock-names. See the
common clock bindings.

That way, it makes it clear that clock-names is the primary lookup
mechanism, rather than some auxiliary documentation.

 +Recommended properties :
 +- clock-frequency : Desired I2C bus clock frequency in Hz. Otherwise

s/Otherwise/If not specified,/

 +  the default 100 kHz frequency will be used. As only Normal and Fast modes
 +  are supported, possible values are 10 and 40.

I think that's just optional. Since there's a well-defined sensible
default, there's no need to recommend it.

 +Optional properties :
 +- i2c-min-scl-pulse-width-us : The minimum valid SCL pulse width that is 
 allowed
 +  through the deglitch circuit. In units of us.
 +- i2c-min-sda-pulse-width-us : The minimum valid SDA pulse width that is 
 allowed
 +  through the deglitch circuit. In units of us.

Are those properties specific to this binding, or intended to be
generic? If specific to this binding, a vendor prefix should be present
in the property name. If not, you probably want to document the
properties in some common file.

 +Examples :

s/Examples/Example/ since there's just one.

 +i2c0: i2c@fed4 {
 + compatible  = st,comms-ssc-i2c;
 + reg = 0xfed4 0x110;
 + interrupts  =  GIC_SPI 187 IRQ_TYPE_EDGE_RISING;
 + clocks  = CLK_S_ICN_REG_0;
 + clock-names = ssc;
 + clock-frequency = 40;
 + pinctrl-names   = default;
 + pinctrl-0   = pinctrl_i2c0_default;

That wasn't mentioned in the binding definition. You'd probably want to
document the requirement for those two properties by saying something like:

A pinctrl state named default must be defined, using the bindings in
../pinctrl/pinctrl-binding.txt.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] arm64: Remove duplicate DEBUG_STACK_USAGE config

2013-10-01 Thread Stephen Boyd
This config item already exists generically in lib/Kconfig.debug.
Remove the duplicate config in arm64.

Signed-off-by: Stephen Boyd sb...@codeaurora.org
---
 arch/arm64/Kconfig.debug | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 1a6bfe9..835c5597 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -6,13 +6,6 @@ config FRAME_POINTER
bool
default y
 
-config DEBUG_STACK_USAGE
-   bool Enable stack utilization instrumentation
-   depends on DEBUG_KERNEL
-   help
- Enables the display of the minimum amount of free stack which each
- task has ever had available in the sysrq-T output.
-
 config EARLY_PRINTK
bool Early printk support
default y
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 3.12.0-rc3: Symbol license change in commit caf5c03f (ACPI: Move acpi_bus_get_device() from bus.c to scan.c)

2013-10-01 Thread Rafael J. Wysocki
On Tuesday, October 01, 2013 12:59:53 PM Peter Hurley wrote:
 I have no love lost for proprietary modules but changing
 acpi_bus_get_device() symbol's license seems gratuitous considering
 the symbol pre-dates the mainline git tree and the code is just
 being moved from one source file to another.

Well, I didn't know whether or not any binary modules use that function in the
first place.

It looks like some of them do, so below is a revert of that change (that I'm
going to push for -rc4).

I wonder what module exactly you have in mind, though?

Rafael


---
From: Rafael J. Wysocki rafael.j.wyso...@intel.com
Subject: ACPI: Use EXPORT_SYMBOL() for acpi_bus_get_device()

Commit caf5c03f (ACPI: Move acpi_bus_get_device() from bus.c to
scan.c) caused acpi_bus_get_device() to be exported using
EXPORT_SYMBOL_GPL(), but that broke some binary drivers in
existence, so revert that change.

Signed-off-by: Rafael J. Wysocki rafael.j.wyso...@intel.com
---
 drivers/acpi/scan.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-pm/drivers/acpi/scan.c
===
--- linux-pm.orig/drivers/acpi/scan.c
+++ linux-pm/drivers/acpi/scan.c
@@ -968,7 +968,7 @@ int acpi_bus_get_device(acpi_handle hand
}
return 0;
 }
-EXPORT_SYMBOL_GPL(acpi_bus_get_device);
+EXPORT_SYMBOL(acpi_bus_get_device);
 
 int acpi_device_add(struct acpi_device *device,
void (*release)(struct device *))

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] x86, kaslr: move CPU flags out of cpucheck

2013-10-01 Thread H. Peter Anvin
On 10/01/2013 12:37 PM, Kees Cook wrote:
 Refactor the CPU flags handling out of the cpucheck routines so that
 they can be reused by the future ASLR routines (in order to detect CPU
 features like RDRAND and RDTSC).
 
 This reworks has_eflag() and has_fpu() to be used on both 32-bit and
 64-bit, and refactors the calls to cpuid to make them PIC-safe on 32-bit.
 
 Signed-off-by: Kees Cook keesc...@chromium.org

Please flag the ones that specifically touch the boot code so that is
clear.  Neither the title or the description makes that at all clear,
and at first reading is fairly confusing as a result.

-hpa


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 08/10] pwm-backlight: Use new enable_gpio field

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 12:39:36PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
  Make use of the new enable_gpio field and allow it to be set from DT as
  well. Now that all legacy users of platform data have been converted to
  initialize this field to an invalid value, it is safe to use the field
  from the driver.
 
  diff --git 
  a/Documentation/devicetree/bindings/video/backlight/pwm-backlight.txt 
  b/Documentation/devicetree/bindings/video/backlight/pwm-backlight.txt
 
   Optional properties:
 
  +  - enable-gpios: a list of GPIOs to enable and disable the backlight
 
 That seems to imply that multiple GPIOs are legal. Was that intended? If
 not, I would suggest:
 
 - enable-gpios: contains a single GPIO specifier for the GPIO which
   enables/disables the backlight. See [1] for the format.
 
   
   [0]: Documentation/devicetree/bindings/pwm/pwm.txt
 + [1]: Documentation/devicetree/bindings/gpio/gpio.txt

Yes, that sounds better. Indeed only a single GPIO is supported. Adding
more than one would require representing the timing between the two and
that will take us back to where we left off with the power sequences.

  diff --git a/drivers/video/backlight/pwm_bl.c 
  b/drivers/video/backlight/pwm_bl.c
 
  @@ -51,12 +55,27 @@ static void pwm_backlight_power_on(struct pwm_bl_data 
  *pb, int brightness,
   pb-lth_brightness;
   
  pwm_config(pb-pwm, duty_cycle, pb-period);
  +
  +   if (gpio_is_valid(pb-enable_gpio)) {
  +   if (pb-enable_gpio_flags  PWM_BACKLIGHT_GPIO_ACTIVE_LOW)
  +   gpio_set_value(pb-enable_gpio, 0);
  +   else
  +   gpio_set_value(pb-enable_gpio, 1);
  +   }
 
 Feel completely free to ignore this, but when the difference in two
 code-paths is solely in function parameters, I prefer to calculate the
 parameter inside the if statement, then call the function outside the
 conditional code, to highlight the common/different parts:
 
 int value;
 
 /* or an if statement for the next 1 line, if you don't like ?: */
 value = (pb-enable_gpio_flags  PWM_BACKLIGHT_GPIO_ACTIVE_LOW) ? 0 : 1;
 gpio_set_value((pb-enable_gpio, value);

I actually tried that, but it looked cluttered, so I opted for the
alternative. Not that this matters all that much, because beginning with
3.13 the GPIO subsystem should be able to handle the active low flag
internally using the new gpiod_*() API. I plan on converting the driver
to use that during the next cycle.

  @@ -148,11 +168,10 @@ static int pwm_backlight_parse_dt(struct device *dev,
 
  +   data-enable_gpio = of_get_named_gpio_flags(node, enable-gpios, 0,
  +   flags);
  +   if (gpio_is_valid(data-enable_gpio)  (flags  OF_GPIO_ACTIVE_LOW))
  +   data-enable_gpio_flags |= PWM_BACKLIGHT_GPIO_ACTIVE_LOW;
 
 This doesn't seem to handle deferred probe correctly; I would expect
 something more like:
 
   data-enable_gpio = of_get_named_gpio_flags(...);
   if (data-enable_gpio == -EPROBE_DEFERRED)
   return data-enable_gpio;
   if (gpio_is_valid(...))
   ...
   return 0;
 
 I suppose it's possible that the value filters down to
 gpio_request_one() and this actually does work out OK, but that feels
 very accidental/implicit to me.

Yes, I think it does indeed propagate to gpio_request_one(), but you're
right, it should be caught explicitly.

Thierry


pgppd7x8RDVD2.pgp
Description: PGP signature


Re: [PATCH 4/7] x86, kaslr: select random base offset

2013-10-01 Thread H. Peter Anvin
On 10/01/2013 12:37 PM, Kees Cook wrote:
 +
 +#include asm/archrandom.h
 +static inline int rdrand(unsigned long *v)
 +{
 + int ok;
 + asm volatile(1:  RDRAND_LONG \n\t
 +  jc 2f\n\t
 +  decl %0\n\t
 +  jnz 1b\n\t
 +  2:
 +  : =r (ok), =a (*v)
 +  : 0 (RDRAND_RETRY_LOOPS));
 + return ok;
 +}
 +

This looks just like rdrand_long() in arch/x86/kernel/cpu/rdrand.c and
could move into the header file, no?

-hpa


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Helge Deller
On 10/01/2013 10:43 PM, Tejun Heo wrote:
 Hello,
 
 On Tue, Oct 01, 2013 at 10:35:20PM +0200, Helge Deller wrote:
 print_worker_info() includes no validity check on the pwq and wq
 pointers before handing them over to the probe_kernel_read() functions.

 It seems that most architectures don't care about that, but at least on
 the parisc architecture this leads to a kernel crash since accesses to
 page zero are protected by the kernel for security reasons.

 Fix this problem by verifying the contents of pwq and wq before usage.
 Even if probe_kernel_read() usually prevents such crashes by disabling
 page faults, clean code should always include such checks. 

 Without this fix issuing echo t  /proc/sysrq-trigger will immediately
 crash the Linux kernel on the parisc architecture.
 
 Hmm... um had similar problem but the root cause here is that the arch
 isn't implementing probe_kernel_read() properly.  We really have no
 idea what the pointer value may be at the dump point and that's why we
 use probe_kernel_read().  If something like the above is necessary for
 the time being, the correct place would be the arch
 probe_kernel_read() implementation.  James, would it be difficult
 implement proper probe_kernel_read() on parisc?

No, it's not really complicated.
That was my initial way to work around that problem.

But is this really necessary? Isn't a pointer which points to mem zero most
likely wrong on any architecture?

In addition I wrote another patch to work around that problem in the parisc
page fault handler (which is needed anyway) too:
https://patchwork.kernel.org/patch/2971701/

So, in summary my patch here is not really necessary, but for the sake of
clean code I think it doesn't hurt either and as such it would be nice if
you could apply it.

Helge
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 09/10] pwm-backlight: Use an optional power supply

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 12:43:57PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
  Many backlights require a power supply to work properly. This commit
  uses a power-supply regulator, if available, to power up and power down
  the panel.
 
 I think that all backlights require a power supply, albeit the supply
 may not be SW-controllable. Hence, shouldn't the regulator be mandatory
 in the binding, yet the driver be defensively coded such that if one
 isn't specified, the driver continues to work?

That has already changed in my local version of this patch.

  diff --git a/drivers/video/backlight/pwm_bl.c 
  b/drivers/video/backlight/pwm_bl.c
 
  @@ -253,6 +264,16 @@ static int pwm_backlight_probe(struct platform_device 
  *pdev)
  }
  }
   
  +   pb-power_supply = devm_regulator_get_optional(pdev-dev, power);
 
 ... so I think that should be devm_regulator_get(), since the regulator
 isn't really optional.
 
  +   if (IS_ERR(pb-power_supply)) {
  +   if (PTR_ERR(pb-power_supply) != -ENODEV) {
  +   ret = PTR_ERR(pb-power_supply);
  +   goto err_gpio;
  +   }
  +
  +   pb-power_supply = NULL;
 
 If devm_regulator_get_optional() returns an error value or a valid
 value, then I don't think that this driver should transmute error values
 into NULL; NULL might be a perfectly valid regulator value. Related, I
 think the if (pb-power_supply) tests should be replaced with if
 (!IS_ERR(pb-power_supply)) instead.

All of that is already done in my local tree. This actually turns out to
work rather smoothly with the new support for optional regulators. The
regulator core will give you a dummy regulator (assuming it's there
physically but hasn't been wired up in software) that's always on, so
the driver doesn't even have to special case it anymore.

Thierry


pgpmJw5PLJXe6.pgp
Description: PGP signature


Re: [PATCH 05/10] ARM: SAMSUNG: Initialize PWM backlight enable_gpio field

2013-10-01 Thread Stephen Warren
On 10/01/2013 02:43 PM, Thierry Reding wrote:
 On Tue, Oct 01, 2013 at 12:31:04PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
 The GPIO API defines 0 as being a valid GPIO number, so this
 field needs to be initialized explicitly.
 
 static void __init smdkv210_map_io(void)
 
 @@ -70,6 +70,7 @@ static struct samsung_bl_drvdata
 samsung_dfl_bl_data __initdata = { .max_brightness = 255, 
 .dft_brightness = 255, .pwm_period_ns  = 78770, +   .enable_gpio
 = -1, .init   = samsung_bl_init, .exit   =
 samsung_bl_exit, }, @@ -121,6 +122,10 @@ void __init
 samsung_bl_set(struct samsung_bl_gpio_info *gpio_info, 
 samsung_bl_data-lth_brightness = bl_data-lth_brightness; if
 (bl_data-pwm_period_ns) samsung_bl_data-pwm_period_ns =
 bl_data-pwm_period_ns; +   if (bl_data-enable_gpio) +
 samsung_bl_data-enable_gpio = bl_data-enable_gpio; +  if
 (bl_data-enable_gpio_flags) +
 samsung_bl_data-enable_gpio_flags =
 bl_data-enable_gpio_flags;
 
 Won't this cause the core pwm_bl driver to request/manipulate the
 GPIO, whereas this driver already does that inside the
 samsung_bl_init/exit callbacks? I think you either need to adjust
 those callbacks, or not set the new standard GPIO property in
 samsung_bl_data.
 
 I don't think so. The samsung_bl_data is a copy of
 samsung_dfl_bl_data augmented by board-specific settings. So in
 fact copying these values here is essential to allow boards to
 override the enable_gpio and flags fields. Currently no board sets
 the enable_gpio to a valid GPIO so it's all still handled by the
 callbacks only.

Oh yes, you're right. I was confusing the new enable_gpio field in
pwm_bl's platform data with some other field in a custom data structure.

One minor point though:

 +   if (bl_data-enable_gpio) + samsung_bl_data-enable_gpio =
 bl_data-enable_gpio;

That assumes that enable_gpio==0 means none, whereas you've gone to
great pains in the rest of the series to allow 0 to be a valid GPIO
ID. right now, the default value of samsung_bl_data-enable_gpio is
-1, and if !bl_data-enable_gpio, that value won't be propagated across.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 09/10] pwm-backlight: Use an optional power supply

2013-10-01 Thread Stephen Warren
On 10/01/2013 02:53 PM, Thierry Reding wrote:
 On Tue, Oct 01, 2013 at 12:43:57PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
 Many backlights require a power supply to work properly. This
 commit uses a power-supply regulator, if available, to power up
 and power down the panel.
 
 I think that all backlights require a power supply, albeit the
 supply may not be SW-controllable. Hence, shouldn't the regulator
 be mandatory in the binding, yet the driver be defensively coded
 such that if one isn't specified, the driver continues to work?
 
 That has already changed in my local version of this patch.
 
 diff --git a/drivers/video/backlight/pwm_bl.c
 b/drivers/video/backlight/pwm_bl.c
 
 @@ -253,6 +264,16 @@ static int pwm_backlight_probe(struct
 platform_device *pdev) } }
 
 +   pb-power_supply = devm_regulator_get_optional(pdev-dev,
 power);
 
 ... so I think that should be devm_regulator_get(), since the
 regulator isn't really optional.
 
 +   if (IS_ERR(pb-power_supply)) { +   if
 (PTR_ERR(pb-power_supply) != -ENODEV) { +  ret =
 PTR_ERR(pb-power_supply); +goto err_gpio; +
 } + +
 pb-power_supply = NULL;
 
 If devm_regulator_get_optional() returns an error value or a
 valid value, then I don't think that this driver should transmute
 error values into NULL; NULL might be a perfectly valid regulator
 value. Related, I think the if (pb-power_supply) tests should be
 replaced with if (!IS_ERR(pb-power_supply)) instead.
 
 All of that is already done in my local tree. This actually turns
 out to work rather smoothly with the new support for optional
 regulators. The regulator core will give you a dummy regulator
 (assuming it's there physically but hasn't been wired up in
 software) that's always on, so the driver doesn't even have to
 special case it anymore.

OK, hopefully it (the regulator core) complains about the missing DT
property though; I assume you're using regulator_get() not
regulator_get_optional(), since the supply really is not optional.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Tejun Heo
On Tue, Oct 01, 2013 at 10:53:31PM +0200, Helge Deller wrote:
 So, in summary my patch here is not really necessary, but for the sake of
 clean code I think it doesn't hurt either and as such it would be nice if
 you could apply it.

What? function *must* take any value and try to access it and not
cause failure.  That's the *whole* purpose of that interface.  How is
having incomplete spurious checks around it clean code in any sense
of the word?  That doesn't make any sense.

 Nacked-by: Tejun Heo t...@kernel.org

and *please* don't add any checks like that anywhere else in the
kernel.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/5] mm: migrate zbud pages

2013-10-01 Thread Seth Jennings
On Mon, Sep 30, 2013 at 10:28:46AM +0200, Krzysztof Kozlowski wrote:
 On pią, 2013-09-27 at 17:00 -0500, Seth Jennings wrote:
  I have to say that when I first came up with the idea, I was thinking
  the address space would be at the zswap layer and the radix slots would
  hold zbud handles, not struct page pointers.
  
  However, as I have discovered today, this is problematic when it comes
  to reclaim and migration and serializing access.
  
  I wanted to do as much as possible in the zswap layer since anything
  done in the zbud layer would need to be duplicated in any other future
  allocator that zswap wanted to support.
  
  Unfortunately, zbud abstracts away the struct page and that visibility
  is needed to properly do what we are talking about.
  
  So maybe it is inevitable that this will need to be in the zbud code
  with the radix tree slots pointing to struct pages after all.
 
 To me it looks very similar to the solution proposed in my patches.

Yes, it is very similar.  I'm beginning to like aspects of this patch
more as I explore this issue more.

At first, I balked at the idea of yet another abstraction layer, but it
is very hard to avoid unless you want to completely collapse zswap and
zbud into one another and dissolve the layering.  Then you could do a
direct swap_offset - address mapping.

 The
 difference is that you wish to use offset as radix tree index.
 I thought about this earlier but it imposed two problems:
 
 1. A generalized handle (instead of offset) may be more suitable when
 zbud will be used in other drivers (e.g. zram).
 
 2. It requires redesigning of zswap architecture around
 zswap_frontswap_store() in case of duplicated insertion. Currently when
 storing a page the zswap:
  - allocates zbud page,
  - stores new data in it,
  - checks whether it is a duplicated page (same offset present in
 rbtree),
  - if yes (duplicated) then zswap frees previous entry.
 The problem here lies in allocating zbud page under the same offset.
 This step would replace old data (because we are using the same offset
 in radix tree).

Yes, but the offset is always going to be the key at the top layer
because that is was the swap subsystem uses.  So we'd have to have a
swap_offset - handle - address translation (2 abstraction layers) the
first of which would need to deal with the duplicate store issue.

Seth

 
 In my opinion using zbud handle is in this case more flexible.
 
 
 Best regards,
 Krzysztof
 
  I like the idea of masking the bit into the struct page pointer to
  indicate which buddy maps to the offset.
  
  There is a twist here in that, unlike a normal page cache tree, we can
  have two offsets pointing at different buddies in the same frame
  which means we'll have to do some custom stuff for migration.
  
  The rabbit hole I was going down today has come to an end so I'll take a
  fresh look next week.
  
  Thanks for your ideas and discussion! Maybe we can make zswap/zbud an
  upstanding MM citizen yet!
  
  Seth
  
   
   
In case of zbud, there are two swap offset pointing to
the same page. There might be more if zsmalloc is used.
What is worse it is possible that one swap entry could
point to data that cross a page boundary.

We just won't set page-index since it doesn't have a good meaning in
our case.  Swap cache pages also don't use index, although is seems to
me that they could since there is a 1:1 mapping of a swap cache page to
a swap offset and the index field isn't being used for anything else.
But I digress...
   
   OK.
   

   
Of course, one could try to modify MM to support
multiple mapping of a page in the radix tree.
But I think that MM guys will consider this as a hack
and they will not accept it.

Yes, it will require some changes to the MM to handle zbud pages on the
LRU.  I'm thinking that it won't be too intrusive, depending on how we
choose to mark zbud pages.

   
   Anyway, I think that zswap should use two index engines.
   I mean index in Data Base meaning.
   One index is used to translate swap_entry to compressed page.
   And another one to be used by reclaim and migration by MM,
   probably address_space is a best choice.
   Zbud would responsible for keeping consistency
   between mentioned indexes.
   
   Regards,
   Tomasz Stanislawski
   
Seth

   
Regards,
Tomasz Stanislawski
   
   
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: a href=mailto:d...@kvack.org; em...@kvack.org /a
   
   

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: a href=mailto:d...@kvack.org; em...@kvack.org /a

   
 

--
To unsubscribe from this list: send the line unsubscribe 

Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Tejun Heo
On Tue, Oct 01, 2013 at 05:03:48PM -0400, Tejun Heo wrote:
 On Tue, Oct 01, 2013 at 10:53:31PM +0200, Helge Deller wrote:
  So, in summary my patch here is not really necessary, but for the sake of
  clean code I think it doesn't hurt either and as such it would be nice if
  you could apply it.
 
 What? function *must* take any value and try to access it and not
 cause failure.  That's the *whole* purpose of that interface.  How is
 having incomplete spurious checks around it clean code in any sense
 of the word?  That doesn't make any sense.

Just in case you didn't know already.  probe_kernel_read()'s role is
to take any ulong value and dereference it if it can.  If not, it can
return any value, but it shouldn't crash in any case.  If you're just
adding NULL test in probe_kernel_read(), you're just masking a common
failure pattern and the kernel still *will* panic while dumping the
states.  If a specific arch doesn't have proper probe_kernel_read()
implementation, adding if (!NULL) test there could be a temporary
workaround, but it should be clearly marked as such.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] x86, kaslr: move CPU flags out of cpucheck

2013-10-01 Thread Kees Cook
On Tue, Oct 1, 2013 at 1:48 PM, H. Peter Anvin h...@zytor.com wrote:
 On 10/01/2013 12:37 PM, Kees Cook wrote:
 Refactor the CPU flags handling out of the cpucheck routines so that
 they can be reused by the future ASLR routines (in order to detect CPU
 features like RDRAND and RDTSC).

 This reworks has_eflag() and has_fpu() to be used on both 32-bit and
 64-bit, and refactors the calls to cpuid to make them PIC-safe on 32-bit.

 Signed-off-by: Kees Cook keesc...@chromium.org

 Please flag the ones that specifically touch the boot code so that is
 clear.  Neither the title or the description makes that at all clear,
 and at first reading is fairly confusing as a result.

Yes, good point. Patch 1/7 should be named x86, boot: ...

Thanks!

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-01 Thread Tim Chen
On Tue, 2013-10-01 at 16:01 -0400, Waiman Long wrote:
 On 10/01/2013 12:48 PM, Tim Chen wrote:
  On Mon, 2013-09-30 at 12:36 -0400, Waiman Long wrote:
  On 09/30/2013 12:10 PM, Jason Low wrote:
  On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:
  On 09/28/2013 12:34 AM, Jason Low wrote:
  Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
  functions would look like after applying the proposed changes.
 
  static noinline
  void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node 
  *node)
  {
 struct mcs_spin_node *prev;
 
 /* Init node */
 node-locked = 0;
 node-next   = NULL;
 
 prev = xchg(lock, node);
 if (likely(prev == NULL)) {
 /* Lock acquired. No need to set node-locked since 
  it
  won't be used */
 return;
 }
 ACCESS_ONCE(prev-next) = node;
 /* Wait until the lock holder passes the lock down */
 while (!ACCESS_ONCE(node-locked))
 arch_mutex_cpu_relax();
 smp_mb();
  I wonder if a memory barrier is really needed here.
  If the compiler can reorder the while (!ACCESS_ONCE(node-locked)) check
  so that the check occurs after an instruction in the critical section,
  then the barrier may be necessary.
 
  In that case, just a barrier() call should be enough.
  The cpu could still be executing out of order load instruction from the
  critical section before checking node-locked?  Probably smp_mb() is
  still needed.
 
  Tim
 
 But this is the lock function, a barrier() call should be enough to 
 prevent the critical section from creeping up there. We certainly need 
 some kind of memory barrier at the end of the unlock function.

I may be missing something.  My understanding is that barrier only
prevents the compiler from rearranging instructions, but not for cpu out
of order execution (as in smp_mb). So cpu could read memory in the next
critical section, before node-locked is true, (i.e. unlock has been
completed).  If we only have a simple barrier at end of mcs_lock, then
say the code on CPU1 is

mcs_lock
x = 1;
...
x = 2;
mcs_unlock

and CPU 2 is

mcs_lock
y = x;
...
mcs_unlock

We expect y to be 2 after the y = x assignment.  But we
we may execute the code as

CPU1CPU2

x = 1;
... y = x;  ( y=1, out of order load)
x = 2
mcs_unlock
Check node-locked==true
continue executing critical section (y=1 when we expect 
y=2)

So we get y to be 1 when we expect that it should be 2.  Adding smp_mb
after the node-locked check in lock code

   ACCESS_ONCE(prev-next) = node;
   /* Wait until the lock holder passes the lock down */
   while (!ACCESS_ONCE(node-locked))
arch_mutex_cpu_relax();
   smp_mb();

should prevent this scenario.  

Thanks.
Tim

 
 -Longman
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 10/10] pwm-backlight: Allow backlight to remain disabled on boot

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 12:50:51PM -0600, Stephen Warren wrote:
 On 09/23/2013 03:41 PM, Thierry Reding wrote:
  The default for backlight devices is to be enabled immediately when
  registering with the backlight core. This can be useful for setups that
  use a simple framebuffer device and where the backlight cannot otherwise
  be hooked up to the panel.
  
  However, when dealing with more complex setups, such as those of recent
  ARM SoCs, this can be problematic. Since the backlight is usually setup
  separately from the display controller, the probe order is not usually
  deterministic. That can lead to situations where the backlight will be
  powered up and the panel will show an uninitialized framebuffer.
  
  Furthermore, subsystems such as DRM have advanced functionality to set
  the power mode of a panel. In order to allow such setups to power up the
  panel at exactly the right moment, a way is needed to prevent the
  backlight core from powering the backlight up automatically when it is
  registered.
  
  This commit introduces a new boot_off field in the platform data (and
  also implements getting the same information from device tree). When set
  the initial backlight power mode will be set to off.
 
  diff --git 
  a/Documentation/devicetree/bindings/video/backlight/pwm-backlight.txt 
  b/Documentation/devicetree/bindings/video/backlight/pwm-backlight.txt
 
  +  - backlight-boot-off: keep the backlight disabled on boot
 
 A few thoughts:
 
 1) Does this property indicate that:
 
 a) The current state of the backlight at boot. In which case, this will
 need bootloader involvement to modify the value in the DT at boot time
 based on whether the bootloader turned on the backlight:-(

I was pretty much following the regulator bindings here. But the meaning
is different indeed, see below.

 b) That the driver should not turn on the backlight immediately? That
 seems to describe driver behaviour more than HW. Is it appropriate to
 put that into DT?

That's what it was supposed to mean. The idea is, and I mentioned that
in the commit message, that when wired up with a higher-level API such
as DRM, then usually that framework knows exactly when to enable the
backlight. Having the backlight enable itself on probe may cause the
panel to light up with no content or garbage.

 Your suggestion to make the backlight not turn on by default might be a
 better fix?

I think so too, but the problem in this case is that users of this
driver heretofore assumed that it would be turned on by default. I think
that's been common practice for all backlight drivers. Adding a property
to DT was supposed to be a way to keep backwards compatibility with any
existing users while at the same time allowing the backlight to be used
in a more modern system. I don't really have any other ideas how to
achieve this.

It is quite possible that the only reason why turning on the backlight
at probe time is the current default is because backlight_properties'
.power field is by default initialized to 0, which turns out to be the
value of FB_BLANK_UNBLANK. That's been the source of quite a bit of
confusion (I could tell you stories of how people tried to convince me
that there must be a bug in the Linux kernel because writing a 0 to the
backlight's bl_power field in sysfs turns the backlight on instead of
off). The same is true for DPMS modes in DRM and X, where on has the
value 0.

Now there have been plans to overhaul the backlight subsystem for quite
some time. One of the goals was to decouple it from the FB subsystem,
which might help solve this to some degree. At the same time sysfs is an
ABI, so we can't just go and change it randomly. The same is true for
the default behaviour of enabling the backlight at boot. That can
equally be considered an ABI and changing it will cause the majority of
devices to regress, and that's not something that I look forward to
taking the blame for...

 2) Should the driver instead attempt to read the current state of the
 GPIO output to determine whether the backlight is on? This may not be
 possible on all HW.
 
 3) Doesn't the following code in probe() (added in a previous patch)
 need to be updated?
 
  +   if (gpio_is_valid(pb-enable_gpio)) {
  +   if (pb-enable_gpio_flags  PWM_BACKLIGHT_GPIO_ACTIVE_LOW)
  +   gpio_set_value(pb-enable_gpio, 0);
  +   else
  +   gpio_set_value(pb-enable_gpio, 1);
  +   }
 
 ... That assumes that the backlight is on at boot, and hence presumably
 after this patch still always turns on the backlight, only to turn it
 off very quickly once the following code in this patch executes:
 
 (and perhaps we also need to avoid turning the backlight off then on if
 it was already on at boot)

Yes, that's a good point. Depending on the outcome of the above
discussion I'll update this to match the expectations.

Thierry


pgpKi7vvNwTzy.pgp
Description: PGP signature


Re: [PATCH 4/7] x86, kaslr: select random base offset

2013-10-01 Thread Kees Cook
On Tue, Oct 1, 2013 at 1:46 PM, H. Peter Anvin h...@zytor.com wrote:
 On 10/01/2013 12:37 PM, Kees Cook wrote:
 +
 +#include asm/archrandom.h
 +static inline int rdrand(unsigned long *v)
 +{
 + int ok;
 + asm volatile(1:  RDRAND_LONG \n\t
 +  jc 2f\n\t
 +  decl %0\n\t
 +  jnz 1b\n\t
 +  2:
 +  : =r (ok), =a (*v)
 +  : 0 (RDRAND_RETRY_LOOPS));
 + return ok;
 +}
 +

 This looks just like rdrand_long() in arch/x86/kernel/cpu/rdrand.c and
 could move into the header file, no?

Yes, good idea. I'll move it into archrandom.h instead of this copy/paste.

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 3.12.0-rc3: Symbol license change in commit caf5c03f (ACPI: Move acpi_bus_get_device() from bus.c to scan.c)

2013-10-01 Thread Peter Hurley

On 10/01/2013 05:00 PM, Rafael J. Wysocki wrote:

On Tuesday, October 01, 2013 12:59:53 PM Peter Hurley wrote:

I have no love lost for proprietary modules but changing
acpi_bus_get_device() symbol's license seems gratuitous considering
the symbol pre-dates the mainline git tree and the code is just
being moved from one source file to another.


Well, I didn't know whether or not any binary modules use that function in the
first place.

It looks like some of them do, so below is a revert of that change (that I'm
going to push for -rc4).

I wonder what module exactly you have in mind, though?


For 3.12, the nouveau driver wants to use MSIs by default.
Unfortunately, some hardware which should support it doesn't.

The binary driver recently migrated to MSIs by default as well,
so I was testing to see if the hardware could run stably with that
driver with MSIs on (since I don't use the binary driver, I needed to
experiment).

Switching back and forth between the drivers is really error-prone;
instead, I sacrificed an older partition/userspace, where I confirmed
that the binary driver does run stably with MSIs -- on kernel 3.2.x.

When I tried to repeat the testing on 3.12-rc2  -rc3, I happened
upon this change.

Regards,
Peter Hurley







--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 05/10] ARM: SAMSUNG: Initialize PWM backlight enable_gpio field

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 02:58:22PM -0600, Stephen Warren wrote:
 On 10/01/2013 02:43 PM, Thierry Reding wrote:
  On Tue, Oct 01, 2013 at 12:31:04PM -0600, Stephen Warren wrote:
  On 09/23/2013 03:41 PM, Thierry Reding wrote:
  The GPIO API defines 0 as being a valid GPIO number, so this
  field needs to be initialized explicitly.
  
  static void __init smdkv210_map_io(void)
  
  @@ -70,6 +70,7 @@ static struct samsung_bl_drvdata
  samsung_dfl_bl_data __initdata = { .max_brightness = 255, 
  .dft_brightness = 255, .pwm_period_ns  = 78770, + .enable_gpio
  = -1, .init   = samsung_bl_init, .exit   =
  samsung_bl_exit, }, @@ -121,6 +122,10 @@ void __init
  samsung_bl_set(struct samsung_bl_gpio_info *gpio_info, 
  samsung_bl_data-lth_brightness = bl_data-lth_brightness; if
  (bl_data-pwm_period_ns) samsung_bl_data-pwm_period_ns =
  bl_data-pwm_period_ns; + if (bl_data-enable_gpio) +
  samsung_bl_data-enable_gpio = bl_data-enable_gpio; +if
  (bl_data-enable_gpio_flags) +
  samsung_bl_data-enable_gpio_flags =
  bl_data-enable_gpio_flags;
  
  Won't this cause the core pwm_bl driver to request/manipulate the
  GPIO, whereas this driver already does that inside the
  samsung_bl_init/exit callbacks? I think you either need to adjust
  those callbacks, or not set the new standard GPIO property in
  samsung_bl_data.
  
  I don't think so. The samsung_bl_data is a copy of
  samsung_dfl_bl_data augmented by board-specific settings. So in
  fact copying these values here is essential to allow boards to
  override the enable_gpio and flags fields. Currently no board sets
  the enable_gpio to a valid GPIO so it's all still handled by the
  callbacks only.
 
 Oh yes, you're right. I was confusing the new enable_gpio field in
 pwm_bl's platform data with some other field in a custom data structure.
 
 One minor point though:
 
  + if (bl_data-enable_gpio) + samsung_bl_data-enable_gpio =
  bl_data-enable_gpio;
 
 That assumes that enable_gpio==0 means none, whereas you've gone to
 great pains in the rest of the series to allow 0 to be a valid GPIO
 ID. right now, the default value of samsung_bl_data-enable_gpio is
 -1, and if !bl_data-enable_gpio, that value won't be propagated across.

Right, that check should now be:

if (bl_data-enable_gpio = 0)

Well, it depends. It would be possible for the default to specify a
valid GPIO and for a board to override it with -1 (and provide a set of
corresponding callbacks). In that case the right thing to do here would
be not to check at all.

Then again, I don't think that will ever happen, because no fixed GPIO
will ever be a good default. So changing to = 0 instead of != 0 should
work fine.

Again, starting with 3.13 this should become a lot easier to handle
since the GPIO subsystem will gain functionality to use a per-board
lookup table, similarly to what the regulator and PWM subsystems do.
Once that's in place I plan to make another pass over all users of the
pwm-backlight driver and replace the enable_gpio field with a GPIO
lookup table, so that the driver can uniformly request them using a
simple gpiod_get().

Thierry


pgpfiNxhg5rIB.pgp
Description: PGP signature


[PATCH 2/2] usb: chipidea: add Intel Clovertrail pci id

2013-10-01 Thread David Cohen
From: David Cohen david.a.co...@intel.com

Signed-off-by: David Cohen david.a.co...@intel.com
---
 drivers/usb/chipidea/ci_hdrc_pci.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c 
b/drivers/usb/chipidea/ci_hdrc_pci.c
index 08a724b..d514332 100644
--- a/drivers/usb/chipidea/ci_hdrc_pci.c
+++ b/drivers/usb/chipidea/ci_hdrc_pci.c
@@ -129,6 +129,11 @@ static DEFINE_PCI_DEVICE_TABLE(ci_hdrc_pci_id_table) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
.driver_data = (kernel_ulong_t)penwell_pci_platdata,
},
+   {
+   /* Intel Clovertrail */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe006),
+   .driver_data = (kernel_ulong_t)penwell_pci_platdata,
+   },
{ 0 } /* end: all zeroes */
 };
 MODULE_DEVICE_TABLE(pci, ci_hdrc_pci_id_table);
-- 
1.8.4.rc3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] usb: chipidea: cosmetic clean up on pci id list

2013-10-01 Thread David Cohen
From: David Cohen david.a.co...@intel.com

In order to fill a struct with zeroes just the first element needs to
be set to 0, the rest can me omitted. This looks cleaner when reading
the code.

This patch does such clean up change on last item of pci id list.

Signed-off-by: David Cohen david.a.co...@intel.com
---
 drivers/usb/chipidea/ci_hdrc_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c 
b/drivers/usb/chipidea/ci_hdrc_pci.c
index 042320a..08a724b 100644
--- a/drivers/usb/chipidea/ci_hdrc_pci.c
+++ b/drivers/usb/chipidea/ci_hdrc_pci.c
@@ -129,7 +129,7 @@ static DEFINE_PCI_DEVICE_TABLE(ci_hdrc_pci_id_table) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
.driver_data = (kernel_ulong_t)penwell_pci_platdata,
},
-   { 0, 0, 0, 0, 0, 0, 0 /* end: all zeroes */ }
+   { 0 } /* end: all zeroes */
 };
 MODULE_DEVICE_TABLE(pci, ci_hdrc_pci_id_table);
 
-- 
1.8.4.rc3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] usb: chipidea: cosmetic clean up on pci id list

2013-10-01 Thread David Cohen

On 10/01/2013 02:29 PM, David Cohen wrote:

From: David Cohen david.a.co...@intel.com


I gotta fix my e-mail to use @linux.intel.com.
Please, see my v2 patches.

BR, David

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/2] usb: chipidea: add Intel Clovertrail pci id

2013-10-01 Thread David Cohen
Signed-off-by: David Cohen david.a.co...@linux.intel.com
---
 drivers/usb/chipidea/ci_hdrc_pci.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c 
b/drivers/usb/chipidea/ci_hdrc_pci.c
index 08a724b..d514332 100644
--- a/drivers/usb/chipidea/ci_hdrc_pci.c
+++ b/drivers/usb/chipidea/ci_hdrc_pci.c
@@ -129,6 +129,11 @@ static DEFINE_PCI_DEVICE_TABLE(ci_hdrc_pci_id_table) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
.driver_data = (kernel_ulong_t)penwell_pci_platdata,
},
+   {
+   /* Intel Clovertrail */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe006),
+   .driver_data = (kernel_ulong_t)penwell_pci_platdata,
+   },
{ 0 } /* end: all zeroes */
 };
 MODULE_DEVICE_TABLE(pci, ci_hdrc_pci_id_table);
-- 
1.8.4.rc3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] usb: chipidea: cosmetic clean up on pci id list

2013-10-01 Thread David Cohen
In order to fill a struct with zeroes just the first element needs to
be set to 0, the rest can me omitted. This looks cleaner when reading
the code.

This patch does such clean up change on last item of pci id list.

Signed-off-by: David Cohen david.a.co...@linux.intel.com
---
 drivers/usb/chipidea/ci_hdrc_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c 
b/drivers/usb/chipidea/ci_hdrc_pci.c
index 042320a..08a724b 100644
--- a/drivers/usb/chipidea/ci_hdrc_pci.c
+++ b/drivers/usb/chipidea/ci_hdrc_pci.c
@@ -129,7 +129,7 @@ static DEFINE_PCI_DEVICE_TABLE(ci_hdrc_pci_id_table) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
.driver_data = (kernel_ulong_t)penwell_pci_platdata,
},
-   { 0, 0, 0, 0, 0, 0, 0 /* end: all zeroes */ }
+   { 0 } /* end: all zeroes */
 };
 MODULE_DEVICE_TABLE(pci, ci_hdrc_pci_id_table);
 
-- 
1.8.4.rc3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] tick: make sleep length calculation more accurate

2013-10-01 Thread Stephen Boyd
On 09/27/13 03:52, Daniel Lezcano wrote:
 The sleep_length is computed in the tick_nohz_stop_sched_tick function but it
 is used later in the code with in between the local irq enabled.

 cpu_idle_loop
   tick_nohz_idle_enter [ exits with local irq enabled ]
__tick_nohz_idle_enter
  tick_nohz_stop_sched_tick
   ...

   arch_cpu_idle
  menu_select   [ uses here 'sleep_length' ]
   ...

 Between the computation of the sleep length and its usage, some interrupts
 can occur, making the sleep length shorter than actually it is.

 This patch fixes that by moving the sleep_length computation in the
 tick_nohz_get_sleep_length function and store the next_event for the device
 instead of the sleep_length.

 Signed-off-by: Daniel Lezcano daniel.lezc...@linaro.org
 ---
  include/linux/tick.h |2 +-
  kernel/time/tick-sched.c |5 +++--
  2 files changed, 4 insertions(+), 3 deletions(-)

 diff --git a/include/linux/tick.h b/include/linux/tick.h
 index 5128d33..4932004 100644
 --- a/include/linux/tick.h
 +++ b/include/linux/tick.h
 @@ -67,7 +67,7 @@ struct tick_sched {
   ktime_t idle_exittime;
   ktime_t idle_sleeptime;
   ktime_t iowait_sleeptime;
 - ktime_t sleep_length;
 + ktime_t next_event;
   unsigned long   last_jiffies;
   unsigned long   next_jiffies;
   ktime_t idle_expires;

Documentation update?

 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
 index 3612fc7..2007a7f 100644
 --- a/kernel/time/tick-sched.c
 +++ b/kernel/time/tick-sched.c
 @@ -673,7 +673,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct 
 tick_sched *ts,
  out:
   ts-next_jiffies = next_jiffies;
   ts-last_jiffies = last_jiffies;
 - ts-sleep_length = ktime_sub(dev-next_event, now);
 + ts-next_event = dev-next_event;
  
   return ret;
  }
 @@ -837,8 +837,9 @@ void tick_nohz_irq_exit(void)
  ktime_t tick_nohz_get_sleep_length(void)
  {
   struct tick_sched *ts = __get_cpu_var(tick_cpu_sched);
 + ktime_t now = ktime_get();
  
 - return ts-sleep_length;
 + return ktime_sub(ts-next_event, now);
  }
  
  static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)

What happens if the idling CPU's next_event is updated via that
interrupt? Say if the interrupt handler schedules a timer to fire before
the next timer on the CPU? It looks like we won't notice that.

Perhaps it's better to do this instead?

 ktime_t tick_nohz_get_sleep_length(void)
 {
struct tick_sched *ts = __get_cpu_var(tick_cpu_sched);
+   ktime_t now = ktime_get();
+   struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;

-   return ts-sleep_length;
+   return ktime_sub(dev-next_event, now);
 }


-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 09/10] pwm-backlight: Use an optional power supply

2013-10-01 Thread Thierry Reding
On Tue, Oct 01, 2013 at 02:59:43PM -0600, Stephen Warren wrote:
 On 10/01/2013 02:53 PM, Thierry Reding wrote:
  On Tue, Oct 01, 2013 at 12:43:57PM -0600, Stephen Warren wrote:
  On 09/23/2013 03:41 PM, Thierry Reding wrote:
  Many backlights require a power supply to work properly. This
  commit uses a power-supply regulator, if available, to power up
  and power down the panel.
  
  I think that all backlights require a power supply, albeit the
  supply may not be SW-controllable. Hence, shouldn't the regulator
  be mandatory in the binding, yet the driver be defensively coded
  such that if one isn't specified, the driver continues to work?
  
  That has already changed in my local version of this patch.
  
  diff --git a/drivers/video/backlight/pwm_bl.c
  b/drivers/video/backlight/pwm_bl.c
  
  @@ -253,6 +264,16 @@ static int pwm_backlight_probe(struct
  platform_device *pdev) } }
  
  + pb-power_supply = devm_regulator_get_optional(pdev-dev,
  power);
  
  ... so I think that should be devm_regulator_get(), since the
  regulator isn't really optional.
  
  + if (IS_ERR(pb-power_supply)) { +   if
  (PTR_ERR(pb-power_supply) != -ENODEV) { +ret =
  PTR_ERR(pb-power_supply); +  goto err_gpio; +
  } + +
  pb-power_supply = NULL;
  
  If devm_regulator_get_optional() returns an error value or a
  valid value, then I don't think that this driver should transmute
  error values into NULL; NULL might be a perfectly valid regulator
  value. Related, I think the if (pb-power_supply) tests should be
  replaced with if (!IS_ERR(pb-power_supply)) instead.
  
  All of that is already done in my local tree. This actually turns
  out to work rather smoothly with the new support for optional
  regulators. The regulator core will give you a dummy regulator
  (assuming it's there physically but hasn't been wired up in
  software) that's always on, so the driver doesn't even have to
  special case it anymore.
 
 OK, hopefully it (the regulator core) complains about the missing DT
 property though; I assume you're using regulator_get() not
 regulator_get_optional(), since the supply really is not optional.

It doesn't always. There's a pr_warn() in _regulator_get(), but that's
only for non-DT (since for DT, has_full_constraints is set to true in
regulator_init_complete()). Actually that would mean that the regulator
core will complain as long as init isn't complete. But since, like many
other drivers, pwm-backlight could use deferred probing it's likely to
end up being probed after init.

Cc'ing Mark Brown.

Thierry


pgpg2sr3oP2RB.pgp
Description: PGP signature


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread James Bottomley
On Tue, 2013-10-01 at 16:43 -0400, Tejun Heo wrote:
 Hello,
 
 On Tue, Oct 01, 2013 at 10:35:20PM +0200, Helge Deller wrote:
  print_worker_info() includes no validity check on the pwq and wq
  pointers before handing them over to the probe_kernel_read() functions.
  
  It seems that most architectures don't care about that, but at least on
  the parisc architecture this leads to a kernel crash since accesses to
  page zero are protected by the kernel for security reasons.
  
  Fix this problem by verifying the contents of pwq and wq before usage.
  Even if probe_kernel_read() usually prevents such crashes by disabling
  page faults, clean code should always include such checks. 
  
  Without this fix issuing echo t  /proc/sysrq-trigger will immediately
  crash the Linux kernel on the parisc architecture.
 
 Hmm... um had similar problem but the root cause here is that the arch
 isn't implementing probe_kernel_read() properly.  We really have no
 idea what the pointer value may be at the dump point and that's why we
 use probe_kernel_read().  If something like the above is necessary for
 the time being, the correct place would be the arch
 probe_kernel_read() implementation.  James, would it be difficult
 implement proper probe_kernel_read() on parisc?

The problem seems to be that some traps bypass our exception table
handling.  Helge, do you have the actual stack trace for this?  That
should show where the exception handling is missing.

Thanks,

James


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: spinlock contention of files-file_lock

2013-10-01 Thread Eric Dumazet
From: Eric Dumazet eduma...@google.com

On Mon, 2013-09-30 at 18:44 -0700, Linus Torvalds wrote:

 Now, that only gets rid of fd_install(), but I suspect you could do
 something similar for put_unused_fd() (that one does need cmpxchg for
 the next_fd thing, though). We'd have to replace the non-atomic
 bitops on open_fds[] with atomic ones, just to make sure adjacent bit
 clearings don't screw up concurrent adjacent bit values, but that
 looks fairly straightforward too.

While looking at this (exciting) stuff, I found following bug.

Maybe I am missing something obvious ?

Thanks

[PATCH] fs: fix a race in do_close_on_exec()

commit 6a6d27de (take close-on-exec logics to fs/file.c, clean it up a
bit) added a possible race, as another thread could resize file table
once we released files-file_lock.

We must reload fdt after getting the lock.

Signed-off-by: Eric Dumazet eduma...@google.com
---
 fs/file.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/file.c b/fs/file.c
index 4a78f98..b614f13 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -616,10 +616,10 @@ void do_close_on_exec(struct files_struct *files)
 
/* exec unshares first */
spin_lock(files-file_lock);
+   fdt = files_fdtable(files);
for (i = 0; ; i++) {
unsigned long set;
unsigned fd = i * BITS_PER_LONG;
-   fdt = files_fdtable(files);
if (fd = fdt-max_fds)
break;
set = fdt-close_on_exec[i];
@@ -639,6 +639,9 @@ void do_close_on_exec(struct files_struct *files)
filp_close(file, files);
cond_resched();
spin_lock(files-file_lock);
+
+   /* We released files-file_lock, we must reload fdt */
+   fdt = files_fdtable(files);
}
 
}


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 06/15] sysfs: add sysfs_open_file-sd and -file

2013-10-01 Thread Tejun Heo
sysfs will be converted to use seq_file for read path, which will make
it difficult to pass around multiple pointers directly.  This patch
adds sysfs_open_file-sd and -file so that we can reach all the
necessary data structures from sysfs_open_file.

flush_write_buffer() is updated to drop @dentry which was used to
discover the sysfs_dirent as it's now available through
sysfs_open_file-sd.

This patch doesn't cause any behavior difference.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4b55bcf..af6e909 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,6 +45,8 @@ struct sysfs_open_dirent {
 };
 
 struct sysfs_open_file {
+   struct sysfs_dirent *sd;
+   struct file *file;
size_t  count;
char*page;
struct mutexmutex;
@@ -192,7 +194,6 @@ static int fill_write_buffer(struct sysfs_open_file *of,
 
 /**
  * flush_write_buffer - push buffer to kobject.
- * @dentry:dentry to the attribute
  * @of:open file
  * @count: number of bytes
  *
@@ -200,22 +201,20 @@ static int fill_write_buffer(struct sysfs_open_file *of,
  * dealing with, then call the store() method for the attribute,
  * passing the buffer that we acquired in fill_write_buffer().
  */
-static int flush_write_buffer(struct dentry *dentry,
- struct sysfs_open_file *of, size_t count)
+static int flush_write_buffer(struct sysfs_open_file *of, size_t count)
 {
-   struct sysfs_dirent *attr_sd = dentry-d_fsdata;
-   struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
+   struct kobject *kobj = of-sd-s_parent-s_dir.kobj;
const struct sysfs_ops *ops;
int rc;
 
-   /* need attr_sd for attr and ops, its parent for kobj */
-   if (!sysfs_get_active(attr_sd))
+   /* need @of-sd for attr and ops, its parent for kobj */
+   if (!sysfs_get_active(of-sd))
return -ENODEV;
 
-   ops = sysfs_file_ops(attr_sd);
-   rc = ops-store(kobj, attr_sd-s_attr.attr, of-page, count);
+   ops = sysfs_file_ops(of-sd);
+   rc = ops-store(kobj, of-sd-s_attr.attr, of-page, count);
 
-   sysfs_put_active(attr_sd);
+   sysfs_put_active(of-sd);
 
return rc;
 }
@@ -245,7 +244,7 @@ static ssize_t sysfs_write_file(struct file *file, const 
char __user *buf,
mutex_lock(of-mutex);
len = fill_write_buffer(of, buf, count);
if (len  0)
-   len = flush_write_buffer(file-f_path.dentry, of, len);
+   len = flush_write_buffer(of, len);
if (len  0)
*ppos += len;
mutex_unlock(of-mutex);
@@ -385,6 +384,8 @@ static int sysfs_open_file(struct inode *inode, struct file 
*file)
goto err_out;
 
mutex_init(of-mutex);
+   of-sd = attr_sd;
+   of-file = file;
file-private_data = of;
 
/* make sure we have open dirent struct */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 14/15] sysfs: prepare open path for unified regular / bin file handling

2013-10-01 Thread Tejun Heo
sysfs bin file handling will be merged into the regular file support.
This patch prepares the open path.

This patch updates sysfs_open_file() such that it can handle both
regular and bin files.

This is a preparation and the new bin file path isn't used yet.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 58 -
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 02797a1..417d005 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -610,38 +610,40 @@ static int sysfs_open_file(struct inode *inode, struct 
file *file)
struct sysfs_dirent *attr_sd = file-f_path.dentry-d_fsdata;
struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
struct sysfs_open_file *of;
-   const struct sysfs_ops *ops;
+   bool has_read, has_write;
int error = -EACCES;
 
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active(attr_sd))
return -ENODEV;
 
-   /* every kobject with an attribute needs a ktype assigned */
-   ops = sysfs_file_ops(attr_sd);
-   if (WARN(!ops, KERN_ERR
-missing sysfs attribute operations for kobject: %s\n,
-kobject_name(kobj)))
-   goto err_out;
+   if (sysfs_is_bin(attr_sd)) {
+   struct bin_attribute *battr = attr_sd-s_bin_attr.bin_attr;
 
-   /* File needs write support.
-* The inode's perms must say it's ok,
-* and we must have a store method.
-*/
-   if (file-f_mode  FMODE_WRITE) {
-   if (!(inode-i_mode  S_IWUGO) || !ops-store)
-   goto err_out;
-   }
+   has_read = battr-read || battr-mmap;
+   has_write = battr-write || battr-mmap;
+   } else {
+   const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
 
-   /* File needs read support.
-* The inode's perms must say it's ok, and we there
-* must be a show method for it.
-*/
-   if (file-f_mode  FMODE_READ) {
-   if (!(inode-i_mode  S_IRUGO) || !ops-show)
+   /* every kobject with an attribute needs a ktype assigned */
+   if (WARN(!ops, KERN_ERR
+missing sysfs attribute operations for kobject: %s\n,
+kobject_name(kobj)))
goto err_out;
+
+   has_read = ops-show;
+   has_write = ops-store;
}
 
+   /* check perms and supported operations */
+   if ((file-f_mode  FMODE_WRITE) 
+   (!(inode-i_mode  S_IWUGO) || !has_write))
+   goto err_out;
+
+   if ((file-f_mode  FMODE_READ) 
+   (!(inode-i_mode  S_IRUGO) || !has_read))
+   goto err_out;
+
/* allocate a sysfs_open_file for the file */
error = -ENOMEM;
of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
@@ -653,11 +655,14 @@ static int sysfs_open_file(struct inode *inode, struct 
file *file)
of-file = file;
 
/*
-* Always instantiate seq_file even if read access is not
-* implemented or requested.  This unifies private data access and
-* most files are readable anyway.
+* Always instantiate seq_file even if read access doesn't use
+* seq_file or is not requested.  This unifies private data access
+* and readable regular files are the vast majority anyway.
 */
-   error = single_open(file, sysfs_seq_show, of);
+   if (sysfs_is_bin(attr_sd))
+   error = single_open(file, NULL, of);
+   else
+   error = single_open(file, sysfs_seq_show, of);
if (error)
goto err_free;
 
@@ -807,6 +812,9 @@ const struct file_operations sysfs_bin_operations = {
.write  = sysfs_write_file,
.llseek = generic_file_llseek,
.mmap   = sysfs_bin_mmap,
+   .open   = sysfs_open_file,
+   .release= sysfs_release,
+   .poll   = sysfs_poll,
 };
 
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 12/15] sysfs: add sysfs_bin_read()

2013-10-01 Thread Tejun Heo
sysfs bin file handling will be merged into the regular file support.
This patch prepares the read path.

Copy fs/sysfs/bin.c::read() to fs/sysfs/file.c and make it use
sysfs_open_file instead of bin_buffer.  The function is identical copy
except for the use of sysfs_open_file.

The new function is added to sysfs_bin_operations.  This isn't used
yet but will eventually replace fs/sysfs/bin.c.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 65 +
 1 file changed, 65 insertions(+)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b36473f..9ba492a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -139,6 +139,70 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
return 0;
 }
 
+/*
+ * Read method for bin files.  As reading a bin file can have side-effects,
+ * the exact offset and bytes specified in read(2) call should be passed to
+ * the read callback making it difficult to use seq_file.  Implement
+ * simplistic custom buffering for bin files.
+ */
+static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
+ size_t bytes, loff_t *off)
+{
+   struct sysfs_open_file *of = sysfs_of(file);
+   struct bin_attribute *battr = of-sd-s_bin_attr.bin_attr;
+   struct kobject *kobj = of-sd-s_parent-s_dir.kobj;
+   int size = file_inode(file)-i_size;
+   int count = min_t(size_t, bytes, PAGE_SIZE);
+   loff_t offs = *off;
+   char *buf;
+
+   if (!bytes)
+   return 0;
+
+   if (size) {
+   if (offs  size)
+   return 0;
+   if (offs + count  size)
+   count = size - offs;
+   }
+
+   buf = kmalloc(count, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   /* need of-sd for battr, its parent for kobj */
+   mutex_lock(of-mutex);
+   if (!sysfs_get_active(of-sd)) {
+   count = -ENODEV;
+   mutex_unlock(of-mutex);
+   goto out_free;
+   }
+
+   if (battr-read)
+   count = battr-read(file, kobj, battr, buf, offs, count);
+   else
+   count = -EIO;
+
+   sysfs_put_active(of-sd);
+   mutex_unlock(of-mutex);
+
+   if (count  0)
+   goto out_free;
+
+   if (copy_to_user(userbuf, buf, count)) {
+   count = -EFAULT;
+   goto out_free;
+   }
+
+   pr_debug(offs = %lld, *off = %lld, count = %d\n, offs, *off, count);
+
+   *off = offs + count;
+
+ out_free:
+   kfree(buf);
+   return count;
+}
+
 /**
  * flush_write_buffer - push buffer to kobject
  * @of: open file
@@ -495,6 +559,7 @@ const struct file_operations sysfs_file_operations = {
 };
 
 const struct file_operations sysfs_bin_operations = {
+   .read   = sysfs_bin_read,
.write  = sysfs_write_file,
.llseek = generic_file_llseek,
 };
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 15/15] sysfs: merge regular and bin file handling

2013-10-01 Thread Tejun Heo
With the previous changes, sysfs regular file code is ready to handle
bin files too.  This patch makes bin files share the regular file
path.

* sysfs_create/remove_bin_file() are moved to fs/sysfs/file.c.

* sysfs_init_inode() is updated to use the new sysfs_bin_operations
  instead of bin_fops for bin files.

* fs/sysfs/bin.c and the related pieces are removed.

This patch shouldn't introduce any behavior difference to bin file
accesses.

Overall, this unification reduces the amount of duplicate logic, makes
behaviors more consistent and paves the road for building simpler and
more versatile interface which will allow other subsystems to make use
of sysfs for their pseudo filesystems.

v2: Stale fs/sysfs/bin.c reference dropped from
Documentation/DocBook/filesystems.tmpl.  Reported by kbuild test
robot.

Signed-off-by: Tejun Heo t...@kernel.org
Cc: Kay Sievers k...@vrfy.org
Cc: kbuild test robot fengguang...@intel.com
---
 Documentation/DocBook/filesystems.tmpl |   1 -
 fs/sysfs/Makefile  |   3 +-
 fs/sysfs/bin.c | 491 -
 fs/sysfs/dir.c |   1 -
 fs/sysfs/file.c|  26 ++
 fs/sysfs/inode.c   |   2 +-
 fs/sysfs/sysfs.h   |   6 -
 7 files changed, 28 insertions(+), 502 deletions(-)
 delete mode 100644 fs/sysfs/bin.c

diff --git a/Documentation/DocBook/filesystems.tmpl 
b/Documentation/DocBook/filesystems.tmpl
index 25b58ef..4f67683 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -91,7 +91,6 @@
  titleThe Filesystem for Exporting Kernel Objects/title
 !Efs/sysfs/file.c
 !Efs/sysfs/symlink.c
-!Efs/sysfs/bin.c
   /chapter
 
   chapter id=debugfs
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 7a1ceb9..8876ac1 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,5 +2,4 @@
 # Makefile for the sysfs virtual filesystem
 #
 
-obj-y  := inode.o file.o dir.o symlink.o mount.o bin.o \
-  group.o
+obj-y  := inode.o file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
deleted file mode 100644
index 60a4e78..000
--- a/fs/sysfs/bin.c
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * fs/sysfs/bin.c - sysfs binary file implementation
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Matthew Wilcox
- * Copyright (c) 2004 Silicon Graphics, Inc.
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo te...@suse.de
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include linux/errno.h
-#include linux/fs.h
-#include linux/kernel.h
-#include linux/kobject.h
-#include linux/module.h
-#include linux/slab.h
-#include linux/mutex.h
-#include linux/mm.h
-#include linux/uaccess.h
-
-#include sysfs.h
-
-/*
- * There's one bin_buffer for each open file.
- *
- * filp-private_data points to bin_buffer and
- * sysfs_dirent-s_bin_attr.buffers points to a the bin_buffer s
- * sysfs_dirent-s_bin_attr.buffers is protected by sysfs_bin_lock
- */
-static DEFINE_MUTEX(sysfs_bin_lock);
-
-struct bin_buffer {
-   struct mutexmutex;
-   void*buffer;
-   int mmapped;
-   const struct vm_operations_struct *vm_ops;
-   struct file *file;
-   struct hlist_node   list;
-};
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
-   struct sysfs_dirent *attr_sd = file-f_path.dentry-d_fsdata;
-   struct bin_attribute *attr = attr_sd-s_bin_attr.bin_attr;
-   struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
-   struct bin_buffer *bb = file-private_data;
-   int size = file_inode(file)-i_size;
-   loff_t offs = *off;
-   int count = min_t(size_t, bytes, PAGE_SIZE);
-   char *buf;
-
-   if (!bytes)
-   return 0;
-
-   if (size) {
-   if (offs  size)
-   return 0;
-   if (offs + count  size)
-   count = size - offs;
-   }
-
-   buf = kmalloc(count, GFP_KERNEL);
-   if (!buf)
-   return -ENOMEM;
-
-   /* need attr_sd for attr, its parent for kobj */
-   mutex_lock(bb-mutex);
-   if (!sysfs_get_active(attr_sd)) {
-   count = -ENODEV;
-   mutex_unlock(bb-mutex);
-   goto out_free;
-   }
-
-   if (attr-read)
-   count = attr-read(file, kobj, attr, buf, offs, count);
-   else
-   count = -EIO;
-
-   sysfs_put_active(attr_sd);
-   mutex_unlock(bb-mutex);
-
-   if (count  0)
-   goto out_free;
-
-   if (copy_to_user(userbuf, buf, count)) {
-   count = -EFAULT;
-   goto out_free;
-   }
-
-

[PATCH 13/15] sysfs: copy bin mmap support from fs/sysfs/bin.c to fs/sysfs/file.c

2013-10-01 Thread Tejun Heo
sysfs bin file handling will be merged into the regular file support.
This patch copies mmap support from bin so that fs/sysfs/file.c can
handle mmapping bin files.

The code is copied mostly verbatim with the following updates.

* -mmapped and -vm_ops are added to sysfs_open_file and bin_buffer
  references are replaced with sysfs_open_file ones.

* Symbols are prefixed with sysfs_.

* sysfs_unmap_bin_file() grabs sysfs_open_dirent and traverses
  -files.  Invocation of this function is added to
  sysfs_addrm_finish().

* sysfs_bin_mmap() is added to sysfs_bin_operations.

This is a preparation and the new mmap path isn't used yet.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/dir.c   |   1 +
 fs/sysfs/file.c  | 247 ++-
 fs/sysfs/sysfs.h |   2 +
 3 files changed, 249 insertions(+), 1 deletion(-)

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b518afd..c4040dd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -595,6 +595,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
acxt-removed = sd-u.removed_list;
 
sysfs_deactivate(sd);
+   sysfs_unmap_bin_file(sd);
unmap_bin_file(sd);
sysfs_put(sd);
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9ba492a..02797a1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -22,6 +22,7 @@
 #include linux/limits.h
 #include linux/uaccess.h
 #include linux/seq_file.h
+#include linux/mm.h
 
 #include sysfs.h
 
@@ -52,6 +53,9 @@ struct sysfs_open_file {
struct mutexmutex;
int event;
struct list_headlist;
+
+   boolmmapped;
+   const struct vm_operations_struct *vm_ops;
 };
 
 static bool sysfs_is_bin(struct sysfs_dirent *sd)
@@ -301,6 +305,218 @@ out_free:
return len;
 }
 
+static void sysfs_bin_vma_open(struct vm_area_struct *vma)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+
+   if (!of-vm_ops)
+   return;
+
+   if (!sysfs_get_active(of-sd))
+   return;
+
+   if (of-vm_ops-open)
+   of-vm_ops-open(vma);
+
+   sysfs_put_active(of-sd);
+}
+
+static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+   int ret;
+
+   if (!of-vm_ops)
+   return VM_FAULT_SIGBUS;
+
+   if (!sysfs_get_active(of-sd))
+   return VM_FAULT_SIGBUS;
+
+   ret = VM_FAULT_SIGBUS;
+   if (of-vm_ops-fault)
+   ret = of-vm_ops-fault(vma, vmf);
+
+   sysfs_put_active(of-sd);
+   return ret;
+}
+
+static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+   int ret;
+
+   if (!of-vm_ops)
+   return VM_FAULT_SIGBUS;
+
+   if (!sysfs_get_active(of-sd))
+   return VM_FAULT_SIGBUS;
+
+   ret = 0;
+   if (of-vm_ops-page_mkwrite)
+   ret = of-vm_ops-page_mkwrite(vma, vmf);
+   else
+   file_update_time(file);
+
+   sysfs_put_active(of-sd);
+   return ret;
+}
+
+static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
+   void *buf, int len, int write)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+   int ret;
+
+   if (!of-vm_ops)
+   return -EINVAL;
+
+   if (!sysfs_get_active(of-sd))
+   return -EINVAL;
+
+   ret = -EINVAL;
+   if (of-vm_ops-access)
+   ret = of-vm_ops-access(vma, addr, buf, len, write);
+
+   sysfs_put_active(of-sd);
+   return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int sysfs_bin_set_policy(struct vm_area_struct *vma,
+   struct mempolicy *new)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+   int ret;
+
+   if (!of-vm_ops)
+   return 0;
+
+   if (!sysfs_get_active(of-sd))
+   return -EINVAL;
+
+   ret = 0;
+   if (of-vm_ops-set_policy)
+   ret = of-vm_ops-set_policy(vma, new);
+
+   sysfs_put_active(of-sd);
+   return ret;
+}
+
+static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+   struct file *file = vma-vm_file;
+   struct sysfs_open_file *of = sysfs_of(file);
+   struct mempolicy *pol;
+
+   if (!of-vm_ops)
+   return vma-vm_policy;
+
+   if (!sysfs_get_active(of-sd))
+   return vma-vm_policy;
+
+   pol = vma-vm_policy;
+   if (of-vm_ops-get_policy)
+   pol = 

[PATCH 10/15] sysfs: collapse fs/sysfs/bin.c::fill_read() into read()

2013-10-01 Thread Tejun Heo
read() is simple enough and fill_read() being in a separate function
doesn't add anything.  Let's collapse it into read().  This will make
merging bin file handling with regular file.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/bin.c | 36 +++-
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d2142c0..60a4e78 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -44,30 +44,12 @@ struct bin_buffer {
struct hlist_node   list;
 };
 
-static int
-fill_read(struct file *file, char *buffer, loff_t off, size_t count)
+static ssize_t
+read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
struct sysfs_dirent *attr_sd = file-f_path.dentry-d_fsdata;
struct bin_attribute *attr = attr_sd-s_bin_attr.bin_attr;
struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
-   int rc;
-
-   /* need attr_sd for attr, its parent for kobj */
-   if (!sysfs_get_active(attr_sd))
-   return -ENODEV;
-
-   rc = -EIO;
-   if (attr-read)
-   rc = attr-read(file, kobj, attr, buffer, off, count);
-
-   sysfs_put_active(attr_sd);
-
-   return rc;
-}
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
struct bin_buffer *bb = file-private_data;
int size = file_inode(file)-i_size;
loff_t offs = *off;
@@ -88,8 +70,20 @@ read(struct file *file, char __user *userbuf, size_t bytes, 
loff_t *off)
if (!buf)
return -ENOMEM;
 
+   /* need attr_sd for attr, its parent for kobj */
mutex_lock(bb-mutex);
-   count = fill_read(file, buf, offs, count);
+   if (!sysfs_get_active(attr_sd)) {
+   count = -ENODEV;
+   mutex_unlock(bb-mutex);
+   goto out_free;
+   }
+
+   if (attr-read)
+   count = attr-read(file, kobj, attr, buf, offs, count);
+   else
+   count = -EIO;
+
+   sysfs_put_active(attr_sd);
mutex_unlock(bb-mutex);
 
if (count  0)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 11/15] sysfs: prepare path write for unified regular / bin file handling

2013-10-01 Thread Tejun Heo
sysfs bin file handling will be merged into the regular file support.
This patch prepares the write path.

bin file write is almost identical to regular file write except that
the write length is capped by the inode size and @off is passed to the
write method.  This patch adds bin file handling to sysfs_write_file()
so that it can handle both regular and bin files.

A new file_operations struct sysfs_bin_operations is added, which
currently only hosts sysfs_write_file() and generic_file_llseek().
This isn't used yet but will eventually replace fs/sysfs/bin.c.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c  | 40 ++--
 fs/sysfs/sysfs.h |  1 +
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4921bda..b36473f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -54,6 +54,11 @@ struct sysfs_open_file {
struct list_headlist;
 };
 
+static bool sysfs_is_bin(struct sysfs_dirent *sd)
+{
+   return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
+}
+
 static struct sysfs_open_file *sysfs_of(struct file *file)
 {
return ((struct seq_file *)file-private_data)-private;
@@ -138,16 +143,16 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
  * flush_write_buffer - push buffer to kobject
  * @of: open file
  * @buf: data buffer for file
+ * @off: file offset to write to
  * @count: number of bytes
  *
  * Get the correct pointers for the kobject and the attribute we're dealing
  * with, then call the store() method for it with @buf.
  */
-static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
+static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t 
off,
  size_t count)
 {
struct kobject *kobj = of-sd-s_parent-s_dir.kobj;
-   const struct sysfs_ops *ops;
int rc = 0;
 
/*
@@ -161,8 +166,18 @@ static int flush_write_buffer(struct sysfs_open_file *of, 
char *buf,
return -ENODEV;
}
 
-   ops = sysfs_file_ops(of-sd);
-   rc = ops-store(kobj, of-sd-s_attr.attr, buf, count);
+   if (sysfs_is_bin(of-sd)) {
+   struct bin_attribute *battr = of-sd-s_bin_attr.bin_attr;
+
+   rc = -EIO;
+   if (battr-write)
+   rc = battr-write(of-file, kobj, battr, buf, off,
+ count);
+   } else {
+   const struct sysfs_ops *ops = sysfs_file_ops(of-sd);
+
+   rc = ops-store(kobj, of-sd-s_attr.attr, buf, count);
+   }
 
sysfs_put_active(of-sd);
mutex_unlock(of-mutex);
@@ -190,9 +205,17 @@ static ssize_t sysfs_write_file(struct file *file, const 
char __user *user_buf,
size_t count, loff_t *ppos)
 {
struct sysfs_open_file *of = sysfs_of(file);
-   ssize_t len = min_t(size_t, count, PAGE_SIZE - 1);
+   ssize_t len = min_t(size_t, count, PAGE_SIZE);
char *buf;
 
+   if (sysfs_is_bin(of-sd)) {
+   loff_t size = file_inode(file)-i_size;
+
+   if (size = *ppos)
+   return 0;
+   len = min_t(ssize_t, len, size - *ppos);
+   }
+
if (!len)
return 0;
 
@@ -206,7 +229,7 @@ static ssize_t sysfs_write_file(struct file *file, const 
char __user *user_buf,
}
buf[len] = '\0';/* guarantee string termination */
 
-   len = flush_write_buffer(of, buf, len);
+   len = flush_write_buffer(of, buf, *ppos, len);
if (len  0)
*ppos += len;
 out_free:
@@ -471,6 +494,11 @@ const struct file_operations sysfs_file_operations = {
.poll   = sysfs_poll,
 };
 
+const struct file_operations sysfs_bin_operations = {
+   .write  = sysfs_write_file,
+   .llseek = generic_file_llseek,
+};
+
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
   const struct attribute *attr, int type,
   umode_t amode, const void *ns)
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 4b1d825..f103bac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -212,6 +212,7 @@ int sysfs_inode_init(void);
  * file.c
  */
 extern const struct file_operations sysfs_file_operations;
+extern const struct file_operations sysfs_bin_operations;
 
 int sysfs_add_file(struct sysfs_dirent *dir_sd,
   const struct attribute *attr, int type);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 09/15] sysfs: skip bin_buffer-buffer while reading

2013-10-01 Thread Tejun Heo
After b31ca3f5dfc (sysfs: fix deadlock), bin read() first writes
data to bb-buffer and bounces it to a transient kernel buffer which
is then copied out to userland.  The double bouncing doesn't add
anything.  Let's just use the transient buffer directly.

While at it, rename @temp to @buf for clarity.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/bin.c | 21 -
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d49e6ca..d2142c0 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -72,7 +72,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, 
loff_t *off)
int size = file_inode(file)-i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
-   char *temp;
+   char *buf;
 
if (!bytes)
return 0;
@@ -84,23 +84,18 @@ read(struct file *file, char __user *userbuf, size_t bytes, 
loff_t *off)
count = size - offs;
}
 
-   temp = kmalloc(count, GFP_KERNEL);
-   if (!temp)
+   buf = kmalloc(count, GFP_KERNEL);
+   if (!buf)
return -ENOMEM;
 
mutex_lock(bb-mutex);
+   count = fill_read(file, buf, offs, count);
+   mutex_unlock(bb-mutex);
 
-   count = fill_read(file, bb-buffer, offs, count);
-   if (count  0) {
-   mutex_unlock(bb-mutex);
+   if (count  0)
goto out_free;
-   }
-
-   memcpy(temp, bb-buffer, count);
 
-   mutex_unlock(bb-mutex);
-
-   if (copy_to_user(userbuf, temp, count)) {
+   if (copy_to_user(userbuf, buf, count)) {
count = -EFAULT;
goto out_free;
}
@@ -110,7 +105,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, 
loff_t *off)
*off = offs + count;
 
  out_free:
-   kfree(temp);
+   kfree(buf);
return count;
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/6] clk: exynos5410: register clocks using common clock framework

2013-10-01 Thread Stephen Warren
On 10/01/2013 10:17 AM, Vyacheslav Tyrtov wrote:
 From: Tarek Dakhran t.dakh...@samsung.com
 
 The EXYNOS5410 clocks are statically listed and registered
 using the Samsung specific common clock helper functions.

 diff --git a/Documentation/devicetree/bindings/clock/exynos5410-clock.txt 
 b/Documentation/devicetree/bindings/clock/exynos5410-clock.txt

 +   [Core Clocks]
 +  [Clock Gate for Special Clocks]
 +   [Peripheral Clock Gates]

These headers/titles for the sections/lists aren't consistently aligned.

 +  [Clock Gate for Special Clocks]
 +
 +  Clock  ID
 +  
 +  sclk_uart0 128
 +  sclk_uart1 129
 +  sclk_uart2 130
 +  sclk_uart3 131
 +  sclk_mmc0  132
 +  sclk_mmc1  133
 +  sclk_mmc2  134
 +
 +   [Peripheral Clock Gates]
 +
 +  Clock  ID
 +  
 +
 +  uart0  257
 +  uart1  258
 +  uart2  259
 +  uart3  260
 +  mct315
 +  mmc0   351
 +  mmc1   352
 +  mmc2   353

That's not many clocks. I assume you're planning on adding more IDs
later, in a backwards-compatible fashion? I suppose that's OK since it
won't break any existing usage, as long as there's no need to renumber
any existing values.

On that topic, are any of those clock IDs derived from HW, e.g. register
numbers, or bit numbers in an array of registers? Numbering clocks in a
HW-derived fashion would make it easier or more obvious how to add new
clock IDs later while maintaining some consistency and without
introducing the desire to break any ABI.

Finally, how about creating a header file such as
include/dt-bindings/clock/exynos5410.h to define those clock IDs, so
that both *.dts and the clock driver can share the values without having
to manually write them?

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 05/15] sysfs: rename sysfs_buffer to sysfs_open_file

2013-10-01 Thread Tejun Heo
sysfs read path will be converted to use seq_file which will handle
buffering making sysfs_buffer a misnomer.  Rename sysfs_buffer to
sysfs_open_file, and sysfs_open_dirent-buffers to -files.

This path is pure rename.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 127 
 1 file changed, 63 insertions(+), 64 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 499cff8..4b55bcf 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -25,14 +25,14 @@
 #include sysfs.h
 
 /*
- * There's one sysfs_buffer for each open file and one sysfs_open_dirent
+ * There's one sysfs_open_file for each open file and one sysfs_open_dirent
  * for each sysfs_dirent with one or more open files.
  *
  * sysfs_dirent-s_attr.open points to sysfs_open_dirent.  s_attr.open is
  * protected by sysfs_open_dirent_lock.
  *
- * filp-private_data points to sysfs_buffer which is chained at
- * sysfs_open_dirent-buffers, which is protected by sysfs_open_file_mutex.
+ * filp-private_data points to sysfs_open_file which is chained at
+ * sysfs_open_dirent-files, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
 static DEFINE_MUTEX(sysfs_open_file_mutex);
@@ -41,10 +41,10 @@ struct sysfs_open_dirent {
atomic_trefcnt;
atomic_tevent;
wait_queue_head_t   poll;
-   struct list_headbuffers; /* goes through sysfs_buffer.list */
+   struct list_headfiles; /* goes through sysfs_open_file.list */
 };
 
-struct sysfs_buffer {
+struct sysfs_open_file {
size_t  count;
char*page;
struct mutexmutex;
@@ -75,7 +75,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct 
sysfs_dirent *sd)
  * This is called only once, on the file's first read unless an error
  * is returned.
  */
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
+static int fill_read_buffer(struct dentry *dentry, struct sysfs_open_file *of)
 {
struct sysfs_dirent *attr_sd = dentry-d_fsdata;
struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
@@ -83,19 +83,19 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
int ret = 0;
ssize_t count;
 
-   if (!buffer-page)
-   buffer-page = (char *) get_zeroed_page(GFP_KERNEL);
-   if (!buffer-page)
+   if (!of-page)
+   of-page = (char *) get_zeroed_page(GFP_KERNEL);
+   if (!of-page)
return -ENOMEM;
 
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active(attr_sd))
return -ENODEV;
 
-   buffer-event = atomic_read(attr_sd-s_attr.open-event);
+   of-event = atomic_read(attr_sd-s_attr.open-event);
 
ops = sysfs_file_ops(attr_sd);
-   count = ops-show(kobj, attr_sd-s_attr.attr, buffer-page);
+   count = ops-show(kobj, attr_sd-s_attr.attr, of-page);
 
sysfs_put_active(attr_sd);
 
@@ -110,7 +110,7 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
count = PAGE_SIZE - 1;
}
if (count = 0)
-   buffer-count = count;
+   of-count = count;
else
ret = count;
return ret;
@@ -138,63 +138,62 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
 static ssize_t
 sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t 
*ppos)
 {
-   struct sysfs_buffer *buffer = file-private_data;
+   struct sysfs_open_file *of = file-private_data;
ssize_t retval = 0;
 
-   mutex_lock(buffer-mutex);
+   mutex_lock(of-mutex);
/*
 * Fill on zero offset and the first read so that silly things like
 * dd bs=1 skip=N can work on sysfs files.
 */
-   if (*ppos == 0 || !buffer-page) {
-   retval = fill_read_buffer(file-f_path.dentry, buffer);
+   if (*ppos == 0 || !of-page) {
+   retval = fill_read_buffer(file-f_path.dentry, of);
if (retval)
goto out;
}
pr_debug(%s: count = %zd, ppos = %lld, buf = %s\n,
-__func__, count, *ppos, buffer-page);
-   retval = simple_read_from_buffer(buf, count, ppos, buffer-page,
-buffer-count);
+__func__, count, *ppos, of-page);
+   retval = simple_read_from_buffer(buf, count, ppos, of-page, of-count);
 out:
-   mutex_unlock(buffer-mutex);
+   mutex_unlock(of-mutex);
return retval;
 }
 
 /**
  * fill_write_buffer - copy buffer from userspace.
- * @buffer:data buffer for file.
+ * @of:open file struct.
  * @buf:   data from user.
  * @count: number of bytes in @userbuf.
  *
- *  

[PATCH 07/15] sysfs: use transient write buffer

2013-10-01 Thread Tejun Heo
There isn't much to be gained by keeping around kernel buffer while a
file is open especially as the read path planned to be converted to
use seq_file and won't use the buffer.  This patch makes
sysfs_write_file() use per-write transient buffer instead of
sysfs_open_file-page.

This simplifies the write path, enables removing sysfs_open_file-page
once read path is updated and will help merging bin file write path
which already requires the use of a transient buffer due to a locking
order issue.

As the function comments of flush_write_buffer() and
sysfs_write_buffer() are being updated anyway, reformat them so that
they're more conventional.

v2: Use min_t() instead of min() in sysfs_write_file() to avoid build
warning on arm.  Reported by build test robot.

Signed-off-by: Tejun Heo t...@kernel.org
Cc: kbuild test robot fengguang...@intel.com
---
 fs/sysfs/file.c | 114 ++--
 1 file changed, 52 insertions(+), 62 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index af6e909..53cc096 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -162,92 +162,82 @@ out:
 }
 
 /**
- * fill_write_buffer - copy buffer from userspace.
- * @of:open file struct.
- * @buf:   data from user.
- * @count: number of bytes in @userbuf.
+ * flush_write_buffer - push buffer to kobject
+ * @of: open file
+ * @buf: data buffer for file
+ * @count: number of bytes
  *
- * Allocate @of-page if it hasn't been already, then copy the
- * user-supplied buffer into it.
+ * Get the correct pointers for the kobject and the attribute we're dealing
+ * with, then call the store() method for it with @buf.
  */
-static int fill_write_buffer(struct sysfs_open_file *of,
-const char __user *buf, size_t count)
-{
-   int error;
-
-   if (!of-page)
-   of-page = (char *)get_zeroed_page(GFP_KERNEL);
-   if (!of-page)
-   return -ENOMEM;
-
-   if (count = PAGE_SIZE)
-   count = PAGE_SIZE - 1;
-   error = copy_from_user(of-page, buf, count);
-
-   /*
-* If buf is assumed to contain a string, terminate it by \0, so
-* e.g. sscanf() can scan the string easily.
-*/
-   of-page[count] = 0;
-   return error ? -EFAULT : count;
-}
-
-/**
- * flush_write_buffer - push buffer to kobject.
- * @of:open file
- * @count: number of bytes
- *
- * Get the correct pointers for the kobject and the attribute we're
- * dealing with, then call the store() method for the attribute,
- * passing the buffer that we acquired in fill_write_buffer().
- */
-static int flush_write_buffer(struct sysfs_open_file *of, size_t count)
+static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
+ size_t count)
 {
struct kobject *kobj = of-sd-s_parent-s_dir.kobj;
const struct sysfs_ops *ops;
-   int rc;
+   int rc = 0;
 
-   /* need @of-sd for attr and ops, its parent for kobj */
-   if (!sysfs_get_active(of-sd))
+   /*
+* Need @of-sd for attr and ops, its parent for kobj.  @of-mutex
+* nests outside active ref and is just to ensure that the ops
+* aren't called concurrently for the same open file.
+*/
+   mutex_lock(of-mutex);
+   if (!sysfs_get_active(of-sd)) {
+   mutex_unlock(of-mutex);
return -ENODEV;
+   }
 
ops = sysfs_file_ops(of-sd);
-   rc = ops-store(kobj, of-sd-s_attr.attr, of-page, count);
+   rc = ops-store(kobj, of-sd-s_attr.attr, buf, count);
 
sysfs_put_active(of-sd);
+   mutex_unlock(of-mutex);
 
return rc;
 }
 
 /**
- * sysfs_write_file - write an attribute.
- * @file:  file pointer
- * @buf:   data to write
- * @count: number of bytes
- * @ppos:  starting offset
+ * sysfs_write_file - write an attribute
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
  *
- * Similar to sysfs_read_file(), though working in the opposite direction.
- * We allocate and fill the data from the user in fill_write_buffer(),
- * then push it to the kobject in flush_write_buffer().
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come
- * on the first write.
- * Hint: if you're writing a value, first read the file, modify only the
- * the value you're changing, then write entire buffer back.
+ * Copy data in from userland and pass it to the matching
+ * sysfs_ops-store() by invoking flush_write_buffer().
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * 

Re: [PATCH 3/3] KVM: PPC: Book3S: Add support for hwrng found on some powernv systems

2013-10-01 Thread Benjamin Herrenschmidt
On Tue, 2013-10-01 at 13:19 +0200, Paolo Bonzini wrote:
 Il 01/10/2013 11:38, Benjamin Herrenschmidt ha scritto:
  So for the sake of that dogma you are going to make us do something that
  is about 100 times slower ? (and possibly involves more lines of code)
 
 If it's 100 times slower there is something else that's wrong.  It's
 most likely not 100 times slower, and this makes me wonder if you or
 Michael actually timed the code at all.

We haven't but it's pretty obvious:

 - The KVM real mode implementation: guest issues the hcall, we remain
in real mode, within the MMU context of the guest, all secondary threads
on the core are still running in the guest, and we do an MMIO  return.

 - The qemu variant: guest issues the hcall we need to exit the guest,
which means bring *all* threads on the core out of KVM, switch the full
MMU context back to host (which among others involves flushing the ERAT,
aka level 1 TLB), while sending the secondary threads into idle loops.
Then we return to qemu user context, which will then use /dev/random -
back into the kernel and out, at which point we can return to the guest,
so back into the kernel, back into run which means IPI the secondary
threads on the core, switch the MMU context again until we can finally
go back to executing guest instructions.

So no we haven't measured. But it is going to be VERY VERY VERY much
slower. Our exit latencies are bad with our current MMU *and* any exit
is going to cause all secondary threads on the core to have to exit as
well (remember P7 is 4 threads, P8 is 8)

  It's not just speed ... H_RANDOM is going to be called by the guest
  kernel. A round trip to qemu is going to introduce a kernel jitter
  (complete stop of operations of the kernel on that virtual processor) of
  a full exit + round trip to qemu + back to the kernel to get to some
  source of random number ...  this is going to be in the dozens of ns at
  least.
 
 I guess you mean dozens of *micro*seconds, which is somewhat exaggerated
 but not too much.  On x86 some reasonable timings are:

Yes.

   100 cyclesbare metal rdrand
   2000 cycles   guest-hypervisor-guest
   15000 cycles  guest-userspace-guest
 
 (100 cycles = 40 ns = 200 MB/sec; 2000 cycles = ~1 microseconds; 15000
 cycles = ~7.5 microseconds).  Even on 5 year old hardware, a userspace
 roundtrip is around a dozen microseconds.

So in your case going to qemu to emulate rdrand would indeed be 150
times slower, I don't see in what universe that would be considered a
good idea.

 Anyhow, I would like to know more about this hwrng and hypercall.
 
 Does the hwrng return random numbers (like rdrand) or real entropy (like
 rdseed that Intel will add in Broadwell)?

It's a random number obtained from sampling a set of oscillators. It's
slightly biased but we have very simple code (I believe shared with the
host kernel implementation) for whitening it as is required by PAPR.
 
   What about the hypercall?
 For example virtio-rng is specified to return actual entropy, it doesn't
 matter if it is from hardware or software.
 
 In either case, the patches have problems.
 
 1) If the hwrng returns random numbers, the whitening you're doing is
 totally insufficient and patch 2 is forging entropy that doesn't exist.

I will let Paul to comment on the whitening, it passes all the tests
we've been running it through.

 2) If the hwrng returns entropy, a read from the hwrng is going to even
 more expensive than an x86 rdrand (perhaps ~2000 cycles).

Depends how often you read, the HW I think is sampling asynchronously so
you only block on the MMIO if you already consumed the previous sample
but I'll let Paulus provide more details here.

   Hence, doing
 the emulation in the kernel is even less necessary.  Also, if the hwrng
 returns entropy patch 1 is unnecessary: you do not need to waste
 precious entropy bits by passing them to arch_get_random_long; just run
 rngd in the host as that will put the entropy to much better use.

 3) If the hypercall returns random numbers, then it is a pretty
 braindead interface since returning 8 bytes at a time limits the
 throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
  But more important: in this case drivers/char/hw_random/pseries-rng.c
 is completely broken and insecure, just like patch 2 in case (1) above.

How so ?

 4) If the hypercall returns entropy (same as virtio-rng), the same
 considerations on speed apply.  If you can only produce entropy at say 1
 MB/s (so reading 8 bytes take 8 microseconds---which is actually very
 fast), it doesn't matter that much to spend 7 microseconds on a
 userspace roundtrip.  It's going to be only half the speed of bare
 metal, not 100 times slower.
 
 
 Also, you will need _anyway_ extra code that is not present here to
 either disable the rng based on userspace command-line, or to emulate
 the rng from userspace.  It is absolutely _not_ acceptable to have a
 hypercall disappear across 

[PATCHSET v2] sysfs: use seq_file and unify regular and bin file handling

2013-10-01 Thread Tejun Heo
Hello,

Changes from the last take[L] are,

* bin file reads no longer go through seq_file.  It goes through a
  separate read path implemented in sysfs_bin_read().  bin files
  shouldn't see any behavior difference now.

* bin files now use a separate file_operations struct -
  sysfs_bin_operations.  Open and write paths are still shared but
  read path is separate and mmap exists only for the bin files.  While
  this is less uniform than before, it should still render itself well
  to extracting the core functionality.

* 0001-0008 are the same as before.

Patchset description follows.

Currently, sysfs's file handling is a bit weird.

* Regular and bin file paths are similar but implemented completely
  separately duplicating some hairy logics.

* Read path implements custom buffering which is essentially
  degenerate seq_file.

In addition, sysfs core implementation is planned to be separated out
so that it can be shared by other subsystems and the current file
handling is too restrictive and quirky to spread further to other
parts of the kernel.  It'd be a lot more desirable to have read path
completely handled by seq_file which is a lot more versatile and would
also increase overall behavior consistency.

This patchset updates file handling such that read is handled by
seq_file and then merges bin file handling into regular file path.
While some changes introduces behavior changes in extreme corner
cases, they are highly unlikely to be noticeable (please refer to the
description of each patch for details) and generally bring sysfs's
behavior closer to those of procfs or any pseudo filesystem which
makes use of seq_file.

After the conversion, LOC is reduced by ~150 lines and read path is
fully handled by seq_file, which allows defining a new seq_file based
core interface which will enable sharing sysfs from other subsystems.

This patchset contains the following patches.

 0001-sysfs-remove-unused-sysfs_buffer-pos.patch
 0002-sysfs-remove-sysfs_buffer-needs_read_fill.patch
 0003-sysfs-remove-sysfs_buffer-ops.patch
 0004-sysfs-add-sysfs_open_file_mutex.patch
 0005-sysfs-rename-sysfs_buffer-to-sysfs_open_file.patch
 0006-sysfs-add-sysfs_open_file-sd-and-file.patch
 0007-sysfs-use-transient-write-buffer.patch
 0008-sysfs-use-seq_file-when-reading-regular-files.patch
 0009-sysfs-skip-bin_buffer-buffer-while-reading.patch
 0010-sysfs-collapse-fs-sysfs-bin.c-fill_read-into-read.patch
 0011-sysfs-prepare-path-write-for-unified-regular-bin-fil.patch
 0012-sysfs-add-sysfs_bin_read.patch
 0013-sysfs-copy-bin-mmap-support-from-fs-sysfs-bin.c-to-f.patch
 0014-sysfs-prepare-open-path-for-unified-regular-bin-file.patch
 0015-sysfs-merge-regular-and-bin-file-handling.patch

0001-0006 are misc preps.

0007 makes write path use transient buffer instead of the one
persistent during open.

0008 makes read path use seq_file.

0009-0011 prepare for merging bin and regular file handling.

0012-0015 merge bin file handling into regular file support.

The patches are on top of

  linus#master c2d95729e3 (Merge branch 'akpm' (patches from Andrew Morton))
+ [1] [PATCHSET] sysfs: disentangle kobject namespace handling from sysfs
+ [2] [PATCHSET] sysfs: implement sysfs_remove()

and available in the following git branch.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git review-sysfs-seq_file

diffstat follows, thanks.

 Documentation/DocBook/filesystems.tmpl |1
 fs/sysfs/Makefile  |3
 fs/sysfs/bin.c |  502 -
 fs/sysfs/dir.c |2
 fs/sysfs/file.c|  766 -
 fs/sysfs/inode.c   |2
 fs/sysfs/sysfs.h   |7
 7 files changed, 567 insertions(+), 716 deletions(-)

Thanks.

--
tejun

[L] http://thread.gmane.org/gmane.linux.kernel/1569578
[1] http://thread.gmane.org/gmane.linux.kernel/1560372
[2] http://thread.gmane.org/gmane.linux.kernel/1564002
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 08/15] sysfs: use seq_file when reading regular files

2013-10-01 Thread Tejun Heo
sysfs read path implements its own buffering scheme between userland
and kernel callbacks, which essentially is a degenerate duplicate of
seq_file.  This patch replaces the custom read buffering
implementation in sysfs with seq_file.

While the amount of code reduction is small, this reduces low level
hairiness and enables future development of a new versatile API based
on seq_file so that sysfs features can be shared with other
subsystems.

As write path was already converted to not use sysfs_open_file-page,
this patch makes -page and -count unused and removes them.

Userland behavior remains the same except for some extreme corner
cases - e.g. sysfs will now regenerate the content each time a file is
read after a non-contiguous seek whereas the original code would keep
using the same content.  While this is a userland visible behavior
change, it is extremely unlikely to be noticeable and brings sysfs
behavior closer to that of procfs.

Signed-off-by: Tejun Heo t...@kernel.org
Cc: Kay Sievers k...@vrfy.org
---
 fs/sysfs/file.c | 164 +---
 1 file changed, 73 insertions(+), 91 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 53cc096..4921bda 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -21,6 +21,7 @@
 #include linux/mutex.h
 #include linux/limits.h
 #include linux/uaccess.h
+#include linux/seq_file.h
 
 #include sysfs.h
 
@@ -31,7 +32,8 @@
  * sysfs_dirent-s_attr.open points to sysfs_open_dirent.  s_attr.open is
  * protected by sysfs_open_dirent_lock.
  *
- * filp-private_data points to sysfs_open_file which is chained at
+ * filp-private_data points to seq_file whose -private points to
+ * sysfs_open_file.  sysfs_open_files are chained at
  * sysfs_open_dirent-files, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
@@ -47,13 +49,16 @@ struct sysfs_open_dirent {
 struct sysfs_open_file {
struct sysfs_dirent *sd;
struct file *file;
-   size_t  count;
-   char*page;
struct mutexmutex;
int event;
struct list_headlist;
 };
 
+static struct sysfs_open_file *sysfs_of(struct file *file)
+{
+   return ((struct seq_file *)file-private_data)-private;
+}
+
 /*
  * Determine ktype-sysfs_ops for the given sysfs_dirent.  This function
  * must be called while holding an active reference.
@@ -66,40 +71,54 @@ static const struct sysfs_ops *sysfs_file_ops(struct 
sysfs_dirent *sd)
return kobj-ktype ? kobj-ktype-sysfs_ops : NULL;
 }
 
-/**
- * fill_read_buffer - allocate and fill buffer from object.
- * @dentry:dentry pointer.
- * @buffer:data buffer for file.
- *
- * Allocate @buffer-page, if it hasn't been already, then call the
- * kobject's show() method to fill the buffer with this attribute's
- * data.
- * This is called only once, on the file's first read unless an error
- * is returned.
+/*
+ * Reads on sysfs are handled through seq_file, which takes care of hairy
+ * details like buffering and seeking.  The following function pipes
+ * sysfs_ops-show() result through seq_file.
  */
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_open_file *of)
+static int sysfs_seq_show(struct seq_file *sf, void *v)
 {
-   struct sysfs_dirent *attr_sd = dentry-d_fsdata;
-   struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
+   struct sysfs_open_file *of = sf-private;
+   struct kobject *kobj = of-sd-s_parent-s_dir.kobj;
const struct sysfs_ops *ops;
-   int ret = 0;
+   char *buf;
ssize_t count;
 
-   if (!of-page)
-   of-page = (char *) get_zeroed_page(GFP_KERNEL);
-   if (!of-page)
-   return -ENOMEM;
+   /* acquire buffer and ensure that it's = PAGE_SIZE */
+   count = seq_get_buf(sf, buf);
+   if (count  PAGE_SIZE) {
+   seq_commit(sf, -1);
+   return 0;
+   }
 
-   /* need attr_sd for attr and ops, its parent for kobj */
-   if (!sysfs_get_active(attr_sd))
+   /*
+* Need @of-sd for attr and ops, its parent for kobj.  @of-mutex
+* nests outside active ref and is just to ensure that the ops
+* aren't called concurrently for the same open file.
+*/
+   mutex_lock(of-mutex);
+   if (!sysfs_get_active(of-sd)) {
+   mutex_unlock(of-mutex);
return -ENODEV;
+   }
 
-   of-event = atomic_read(attr_sd-s_attr.open-event);
+   of-event = atomic_read(of-sd-s_attr.open-event);
 
-   ops = sysfs_file_ops(attr_sd);
-   count = ops-show(kobj, attr_sd-s_attr.attr, of-page);
+   /*
+* Lookup @ops and invoke show().  Control may reach here via seq
+* file lseek even if @ops-show() isn't implemented.
+*/
+   ops = sysfs_file_ops(of-sd);
+   if (ops-show)
+

[PATCH 01/15] sysfs: remove unused sysfs_buffer-pos

2013-10-01 Thread Tejun Heo
Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1656a79..81e3f72 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -44,7 +44,6 @@ struct sysfs_open_dirent {
 
 struct sysfs_buffer {
size_t  count;
-   loff_t  pos;
char*page;
const struct sysfs_ops  *ops;
struct mutexmutex;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 04/15] sysfs: add sysfs_open_file_mutex

2013-10-01 Thread Tejun Heo
Add a separate mutex to protect sysfs_open_dirent-buffers list.  This
will allow performing sleepable operations while traversing
sysfs_buffers, which will be renamed to sysfs_open_file.

Note that currently sysfs_open_dirent-buffers list isn't being used
for anything and this patch doesn't make any functional difference.
It will be used to merge regular and bin file supports.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7dfcc33..499cff8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -25,15 +25,17 @@
 #include sysfs.h
 
 /*
- * There's one sysfs_buffer for each open file and one
- * sysfs_open_dirent for each sysfs_dirent with one or more open
- * files.
+ * There's one sysfs_buffer for each open file and one sysfs_open_dirent
+ * for each sysfs_dirent with one or more open files.
  *
- * filp-private_data points to sysfs_buffer and
- * sysfs_dirent-s_attr.open points to sysfs_open_dirent.  s_attr.open
- * is protected by sysfs_open_dirent_lock.
+ * sysfs_dirent-s_attr.open points to sysfs_open_dirent.  s_attr.open is
+ * protected by sysfs_open_dirent_lock.
+ *
+ * filp-private_data points to sysfs_buffer which is chained at
+ * sysfs_open_dirent-buffers, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
+static DEFINE_MUTEX(sysfs_open_file_mutex);
 
 struct sysfs_open_dirent {
atomic_trefcnt;
@@ -272,6 +274,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
struct sysfs_open_dirent *od, *new_od = NULL;
 
  retry:
+   mutex_lock(sysfs_open_file_mutex);
spin_lock_irq(sysfs_open_dirent_lock);
 
if (!sd-s_attr.open  new_od) {
@@ -286,6 +289,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
}
 
spin_unlock_irq(sysfs_open_dirent_lock);
+   mutex_unlock(sysfs_open_file_mutex);
 
if (od) {
kfree(new_od);
@@ -321,6 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
struct sysfs_open_dirent *od = sd-s_attr.open;
unsigned long flags;
 
+   mutex_lock(sysfs_open_file_mutex);
spin_lock_irqsave(sysfs_open_dirent_lock, flags);
 
list_del(buffer-list);
@@ -330,6 +335,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
od = NULL;
 
spin_unlock_irqrestore(sysfs_open_dirent_lock, flags);
+   mutex_unlock(sysfs_open_file_mutex);
 
kfree(od);
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 03/15] sysfs: remove sysfs_buffer-ops

2013-10-01 Thread Tejun Heo
Currently, sysfs_ops is fetched during sysfs_open_file() and cached in
sysfs_buffer-ops to be used while the file is open.  This patch
removes the caching and makes each operation directly fetch sysfs_ops.

This patch doesn't introduce any behavior difference and is to prepare
for merging regular and bin file supports.

Signed-off-by: Tejun Heo t...@kernel.org
---
 fs/sysfs/file.c | 33 +
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e2fafc0..7dfcc33 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,12 +45,23 @@ struct sysfs_open_dirent {
 struct sysfs_buffer {
size_t  count;
char*page;
-   const struct sysfs_ops  *ops;
struct mutexmutex;
int event;
struct list_headlist;
 };
 
+/*
+ * Determine ktype-sysfs_ops for the given sysfs_dirent.  This function
+ * must be called while holding an active reference.
+ */
+static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
+{
+   struct kobject *kobj = sd-s_parent-s_dir.kobj;
+
+   lockdep_assert_held(sd);
+   return kobj-ktype ? kobj-ktype-sysfs_ops : NULL;
+}
+
 /**
  * fill_read_buffer - allocate and fill buffer from object.
  * @dentry:dentry pointer.
@@ -66,7 +77,7 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
 {
struct sysfs_dirent *attr_sd = dentry-d_fsdata;
struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
-   const struct sysfs_ops *ops = buffer-ops;
+   const struct sysfs_ops *ops;
int ret = 0;
ssize_t count;
 
@@ -80,6 +91,8 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
return -ENODEV;
 
buffer-event = atomic_read(attr_sd-s_attr.open-event);
+
+   ops = sysfs_file_ops(attr_sd);
count = ops-show(kobj, attr_sd-s_attr.attr, buffer-page);
 
sysfs_put_active(attr_sd);
@@ -191,13 +204,14 @@ static int flush_write_buffer(struct dentry *dentry,
 {
struct sysfs_dirent *attr_sd = dentry-d_fsdata;
struct kobject *kobj = attr_sd-s_parent-s_dir.kobj;
-   const struct sysfs_ops *ops = buffer-ops;
+   const struct sysfs_ops *ops;
int rc;
 
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active(attr_sd))
return -ENODEV;
 
+   ops = sysfs_file_ops(attr_sd);
rc = ops-store(kobj, attr_sd-s_attr.attr, buffer-page, count);
 
sysfs_put_active(attr_sd);
@@ -205,7 +219,6 @@ static int flush_write_buffer(struct dentry *dentry,
return rc;
 }
 
-
 /**
  * sysfs_write_file - write an attribute.
  * @file:  file pointer
@@ -334,14 +347,11 @@ static int sysfs_open_file(struct inode *inode, struct 
file *file)
return -ENODEV;
 
/* every kobject with an attribute needs a ktype assigned */
-   if (kobj-ktype  kobj-ktype-sysfs_ops)
-   ops = kobj-ktype-sysfs_ops;
-   else {
-   WARN(1, KERN_ERR
-missing sysfs attribute operations for kobject: %s\n,
-kobject_name(kobj));
+   ops = sysfs_file_ops(attr_sd);
+   if (WARN(!ops, KERN_ERR
+missing sysfs attribute operations for kobject: %s\n,
+kobject_name(kobj)))
goto err_out;
-   }
 
/* File needs write support.
 * The inode's perms must say it's ok,
@@ -370,7 +380,6 @@ static int sysfs_open_file(struct inode *inode, struct file 
*file)
goto err_out;
 
mutex_init(buffer-mutex);
-   buffer-ops = ops;
file-private_data = buffer;
 
/* make sure we have open dirent struct */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 02/15] sysfs: remove sysfs_buffer-needs_read_fill

2013-10-01 Thread Tejun Heo
-needs_read_fill is used to implement the following behaviors.

1. Ensure buffer filling on the first read.
2. Force buffer filling after a write.
3. Force buffer filling after a successful poll.

However, #2 and #3 don't really work as sysfs doesn't reset file
position.  While the read buffer would be refilled, the next read
would continue from the position after the last read or write,
requiring an explicit seek to the start for it to be useful, which
makes -needs_read_fill superflous as read buffer is always refilled
if f_pos == 0.

Update sysfs_read_file() to test buffer-page for #1 instead and
remove -needs_read_fill.  While this changes behavior in extreme
corner cases - e.g. re-reading a sysfs file after seeking to non-zero
position after a write or poll, it's highly unlikely to lead to actual
breakage.  This change is to prepare for using seq_file in the read
path.

While at it, reformat a comment in fill_write_buffer().

Signed-off-by: Tejun Heo t...@kernel.org
Cc: Kay Sievers k...@vrfy.org
---
 fs/sysfs/file.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 81e3f72..e2fafc0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,7 +47,6 @@ struct sysfs_buffer {
char*page;
const struct sysfs_ops  *ops;
struct mutexmutex;
-   int needs_read_fill;
int event;
struct list_headlist;
 };
@@ -95,12 +94,10 @@ static int fill_read_buffer(struct dentry *dentry, struct 
sysfs_buffer *buffer)
/* Try to struggle along */
count = PAGE_SIZE - 1;
}
-   if (count = 0) {
-   buffer-needs_read_fill = 0;
+   if (count = 0)
buffer-count = count;
-   } else {
+   else
ret = count;
-   }
return ret;
 }
 
@@ -130,7 +127,11 @@ sysfs_read_file(struct file *file, char __user *buf, 
size_t count, loff_t *ppos)
ssize_t retval = 0;
 
mutex_lock(buffer-mutex);
-   if (buffer-needs_read_fill || *ppos == 0) {
+   /*
+* Fill on zero offset and the first read so that silly things like
+* dd bs=1 skip=N can work on sysfs files.
+*/
+   if (*ppos == 0 || !buffer-page) {
retval = fill_read_buffer(file-f_path.dentry, buffer);
if (retval)
goto out;
@@ -166,14 +167,15 @@ static int fill_write_buffer(struct sysfs_buffer *buffer,
if (count = PAGE_SIZE)
count = PAGE_SIZE - 1;
error = copy_from_user(buffer-page, buf, count);
-   buffer-needs_read_fill = 1;
-   /* if buf is assumed to contain a string, terminate it by \0,
-  so e.g. sscanf() can scan the string easily */
+
+   /*
+* If buf is assumed to contain a string, terminate it by \0, so
+* e.g. sscanf() can scan the string easily.
+*/
buffer-page[count] = 0;
return error ? -EFAULT : count;
 }
 
-
 /**
  * flush_write_buffer - push buffer to kobject.
  * @dentry:dentry to the attribute
@@ -368,7 +370,6 @@ static int sysfs_open_file(struct inode *inode, struct file 
*file)
goto err_out;
 
mutex_init(buffer-mutex);
-   buffer-needs_read_fill = 1;
buffer-ops = ops;
file-private_data = buffer;
 
@@ -435,7 +436,6 @@ static unsigned int sysfs_poll(struct file *filp, 
poll_table *wait)
return DEFAULT_POLLMASK;
 
  trigger:
-   buffer-needs_read_fill = 1;
return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 01/13] tpm: ibmvtpm: Use %zd formatting for size_t format arguments

2013-10-01 Thread Peter Hüwe
Am Montag, 23. September 2013, 20:14:31 schrieb Jason Gunthorpe:
 This suppresses compile warnings on 32 bit builds.
 
 Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com


Reviewed-by: Peter Huewe peterhu...@gmx.de
Signed-off-by: Peter Huewe peterhu...@gmx.de

Staged here 
https://github.com/PeterHuewe/linux-tpmdd  for-james

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: RFC: (re-)binding the VFIO platform driver to a platform device

2013-10-01 Thread Kim Phillips
On Tue, 1 Oct 2013 14:15:38 -0500
Scott Wood scottw...@freescale.com wrote:

 On Tue, 2013-10-01 at 13:38 -0500, Kim Phillips wrote:
  Hi,
  
  Santosh and I are having a problem figuring out how to enable binding
  (and re-binding) platform devices to a platform VFIO driver (see
  Antonis' WIP: [1]) in an upstream-acceptable manner.
  
  Binding platform drivers currently depends on a string match in the
  device node's compatible entry.  On an arndale, one can currently
  rebind the same device to the same driver like so:
  
  echo 12ce.i2c  
  /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
  echo 12ce.i2c  /sys/bus/platform/drivers/s3c-i2c/bind
  
  And one can bind it to the vfio-dt driver, as Antonis instructs, by
  appending a 'vfio-dt' string to the device tree compatible entry for
  the device.  Then this would work:
  
  echo 12ce.i2c  
  /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
  echo 12ce.i2c  /sys/bus/platform/drivers/vfio-dt/bind
  
  Consequently, the hack patch below [2] allows any platform device to be
  bound to the vfio-dt driver, without making changes to the device
  tree.  It's a hack because I don't see having any driver name specific
  code in drivers/base/bus.c being upstream acceptable.
 
 Modifying the device tree is the worse part of this.
 
 Is this part of your later suggestion to make compatible writeable after
 boot, or are you talking about messing with the device tree before boot
 (putting software config in the device tree, among other ickiness)?

writeable after boot

  Alternately, device tree compatible entries may be made writeable after
  boot, e.g.:
  
  echo vfio-platform  /proc/device-tree/i2c\@12CE/compatible
  
  [note s/vfio-dt/vfio-platform/]
  
  but that would require the vfio-platform module be reloaded, thereby
  unbinding it from any existing devices it was bound to: we're
  seeking a more dynamic solution.
 
 Eww.
 
 Not to mention that the VFIO user might want to know what the compatible
 was,

well, technically the user would be able to get that info by reading
compatible before writing it, and ideally write the original value back
in addition to the new value.

 or that we might later want to unbind from VFIO and rebind to the
 kernel...

I believe that's independent:  it would depend on which driver's (VFIO,
kernel, or other) sysfs file the device address gets written into.

  Alex Graf (cc'd) proposed an alternate approach: re-write the driver
  name in the device's sysfs entry:
  
  echo vfio-platform  
  /sys/bus/platform/devices/101e.rtc/driver/driver_name
  
  The advantage of this approach is that we can achieve the re-bind
  (unbind + bind) as an atomic operation, which alleviates userspace from
  having to coordinate with other device operations (I think VM migration
  is an example case here).
  
  Note that driver_name currently doesn't exist in sysfs, so it would
  either have to be added, or another means developed to rename the
  driver symlink itself:
 
 I think the ideal interface would be if you could write the sysfs device
 name into the vfio bind file (or some new file in the same directory),
 and have it claim that device (preferably with an atomic unbind from the
 previous driver).

ok.

 We shouldn't be messing around with compatible
 (either modifying it or telling VFIO which compatibles to look for) when
 we know the specific devices (not just type of devices) we want to bind.

ok, but I still don't see how to get past driver_match_device()'s
refusal to allow bind a non-compatible driver (or one who's name isn't
in the compatible list).

Kim
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: RFC: (re-)binding the VFIO platform driver to a platform device

2013-10-01 Thread Kim Phillips
On Tue, 1 Oct 2013 14:17:16 -0500
Scott Wood scottw...@freescale.com wrote:

 On Tue, 2013-10-01 at 14:15 -0500, Scott Wood wrote:
  On Tue, 2013-10-01 at 13:38 -0500, Kim Phillips wrote:
   Hi,
   
   Santosh and I are having a problem figuring out how to enable binding
   (and re-binding) platform devices to a platform VFIO driver (see
   Antonis' WIP: [1]) in an upstream-acceptable manner.
   
   Binding platform drivers currently depends on a string match in the
   device node's compatible entry.  On an arndale, one can currently
   rebind the same device to the same driver like so:
   
   echo 12ce.i2c  
   /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
   echo 12ce.i2c  /sys/bus/platform/drivers/s3c-i2c/bind
   
   And one can bind it to the vfio-dt driver, as Antonis instructs, by
   appending a 'vfio-dt' string to the device tree compatible entry for
   the device.  Then this would work:
   
   echo 12ce.i2c  
   /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
   echo 12ce.i2c  /sys/bus/platform/drivers/vfio-dt/bind
   
   Consequently, the hack patch below [2] allows any platform device to be
   bound to the vfio-dt driver, without making changes to the device
   tree.  It's a hack because I don't see having any driver name specific
   code in drivers/base/bus.c being upstream acceptable.
  
  Modifying the device tree is the worse part of this.
 
 I think I missed something.  How do you reconcile without making
 changes to the device tree with appending a 'vfio-dt' string to the
 device tree compatible entry?

one doesn't need to append 'vfio-dt' to the device tree compatibles if
one uses the hack that makes bind_store ignore trying to
driver_match_device() if the driver is 'vfio-dt'.

Kim
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: RFC: (re-)binding the VFIO platform driver to a platform device

2013-10-01 Thread Kim Phillips
On Tue, 1 Oct 2013 13:00:54 -0700
Greg Kroah-Hartman gre...@linuxfoundation.org wrote:

 On Tue, Oct 01, 2013 at 01:38:31PM -0500, Kim Phillips wrote:
  Hi,
  
  Santosh and I are having a problem figuring out how to enable binding
  (and re-binding) platform devices to a platform VFIO driver (see
  Antonis' WIP: [1]) in an upstream-acceptable manner.
  
  Binding platform drivers currently depends on a string match in the
  device node's compatible entry.  On an arndale, one can currently
  rebind the same device to the same driver like so:
  
  echo 12ce.i2c  
  /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
  echo 12ce.i2c  /sys/bus/platform/drivers/s3c-i2c/bind
  
  And one can bind it to the vfio-dt driver, as Antonis instructs, by
  appending a 'vfio-dt' string to the device tree compatible entry for
  the device.  Then this would work:
  
  echo 12ce.i2c  
  /sys/bus/platform/drivers/s3c-i2c/12ce.i2c/driver/unbind
  echo 12ce.i2c  /sys/bus/platform/drivers/vfio-dt/bind
  
  Consequently, the hack patch below [2] allows any platform device to be
  bound to the vfio-dt driver, without making changes to the device
  tree.  It's a hack because I don't see having any driver name specific
  code in drivers/base/bus.c being upstream acceptable.
 
 You are correct.
 
 What is wrong with just doing the above unbind/bind things through
 sysfs, that is what it is there for, right?

The bind fails because the compatible string in the device tree doesn't
match that of the VFIO platform driver, so driver_match_device always
returns false.

Kim
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: spinlock contention of files-file_lock

2013-10-01 Thread Al Viro
On Tue, Oct 01, 2013 at 02:41:58PM -0700, Eric Dumazet wrote:
 Maybe I am missing something obvious ?

Yes.  do_execve_common() starts with unshare_files(); there can be
no other thread capable of modifying that descriptor table.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Helge Deller
On 10/01/2013 11:40 PM, James Bottomley wrote:
 On Tue, 2013-10-01 at 16:43 -0400, Tejun Heo wrote:
 Hello,

 On Tue, Oct 01, 2013 at 10:35:20PM +0200, Helge Deller wrote:
 print_worker_info() includes no validity check on the pwq and wq
 pointers before handing them over to the probe_kernel_read() functions.

 It seems that most architectures don't care about that, but at least on
 the parisc architecture this leads to a kernel crash since accesses to
 page zero are protected by the kernel for security reasons.

 Fix this problem by verifying the contents of pwq and wq before usage.
 Even if probe_kernel_read() usually prevents such crashes by disabling
 page faults, clean code should always include such checks. 

 Without this fix issuing echo t  /proc/sysrq-trigger will immediately
 crash the Linux kernel on the parisc architecture.

 Hmm... um had similar problem but the root cause here is that the arch
 isn't implementing probe_kernel_read() properly.  We really have no
 idea what the pointer value may be at the dump point and that's why we
 use probe_kernel_read().  If something like the above is necessary for
 the time being, the correct place would be the arch
 probe_kernel_read() implementation.  James, would it be difficult
 implement proper probe_kernel_read() on parisc?
 
 The problem seems to be that some traps bypass our exception table
 handling.  

Yes, that's correct.
It's trap #26 and we directly call parisc_terminate() for fault_space==0
without checking the exception table.
See my patch I posted a few hours ago which fixes this:
https://patchwork.kernel.org/patch/2971701/

 Helge, do you have the actual stack trace for this?  That
 should show where the exception handling is missing.

Here it is:
[47072.976000] ksoftirqd/0 R  running task0 3  2 0x
[47072.976000] Backtrace:
[47072.976000]  [40113a54] __schedule+0x62c/0x808
[47072.976000]
[47072.976000] kworker/0:0HS 401040c0 0 5  2 0x
[47073.468000] Backtrace:
[47073.468000]  [40464264] pa_memcpy+0x44/0xb0
[47073.468000]  [404643e0] __copy_from_user+0x60/0x90
[47073.468000]  [401d99bc] __probe_kernel_read+0x54/0x90
[47073.468000]  [4016cc70] print_worker_info+0x158/0x2c0
[47073.468000]  [40185a60] sched_show_task+0x1c8/0x210
[47073.468000]  [40185b64] show_state_filter+0xbc/0x138
[47073.468000]  [404e85c4] sysrq_handle_showstate+0x34/0x48
[47073.468000]  [404e9154] __handle_sysrq+0x174/0x2f0
[47073.468000]  [404e933c] write_sysrq_trigger+0x6c/0x90
[47073.468000]  [402ca2fc] proc_reg_write+0xbc/0x130
[47073.468000]  [40236d44] vfs_write+0x114/0x268
[47073.468000]  [402373a4] SyS_write+0x94/0xf8
[47073.468000]  [40105fc0] syscall_exit+0x0/0x14
[47073.468000]
[47073.468000]
[47073.468000] Kernel Fault: Code=26 regs=958a09b0 
(Addr=0008)
[47073.468000] CPU: 0 PID: 30189 Comm: bash Not tainted 3.12.0-rc3-64bit+ #1
[47073.468000] task: 7ba64100 ti: 958a task.ti: 
958a
[47073.468000]
[47073.468000]  YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
[47073.468000] PSW: 11001110 Not tainted
[47073.468000] r00-03  00ff0804ff0e 958a08c0 40464264 
958a0960
[47073.468000] r04-07  40d73db0 0008 0008 
958a06f8
[47073.468000] r08-11  958a0600 40c49d18 af535494 
958a0370
[47073.468000] r12-15    0010e7e8 
000fde28
[47073.468000] r16-19   000c7800  

[47073.468000] r20-23  958a06e0 0018 0018 
0003
[47073.468000] r24-27  0008 0008 958a06f8 
40d73db0
[47073.468000] r28-31  958a06f8 958a0930 958a09b0 
0008
[47073.468000] sr00-03  05dc5000   
05dc5000
[47073.468000] sr04-07     

[47073.468000]
[47073.468000] IASQ:   IAOQ: 40463fdc 
40463fe0
[47073.468000]  IIR: 0fe25033ISR:   IOR: 0008
[47073.468000]  CPU:0   CR30: 958a CR31: 
[47073.468000]  ORIG_R28: 958a0b40
[47073.468000]  IAOQ[0]: pa_memcpy_internal+0xec/0x2b4
[47073.468000]  IAOQ[1]: pa_memcpy_internal+0xf0/0x2b4
[47073.468000]  RP(r2): pa_memcpy+0x44/0xb0
[47073.468000] Backtrace:
[47073.468000]  [40464264] pa_memcpy+0x44/0xb0
[47073.468000]  [404643e0] __copy_from_user+0x60/0x90
[47073.468000]  [401d99bc] __probe_kernel_read+0x54/0x90
[47073.468000]  [4016cc70] print_worker_info+0x158/0x2c0
[47073.468000]  [40185a60] sched_show_task+0x1c8/0x210
[47073.468000]  [40185b64] 

Re: spinlock contention of files-file_lock

2013-10-01 Thread Eric Dumazet
On Tue, 2013-10-01 at 23:04 +0100, Al Viro wrote:
 On Tue, Oct 01, 2013 at 02:41:58PM -0700, Eric Dumazet wrote:
  Maybe I am missing something obvious ?
 
 Yes.  do_execve_common() starts with unshare_files(); there can be
 no other thread capable of modifying that descriptor table.

Hmm, then what's the point of using spin_lock() here ?

This gives wrong hints ;)

diff --git a/fs/file.c b/fs/file.c
index 4a78f98..cdbca0d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -615,11 +615,10 @@ void do_close_on_exec(struct files_struct *files)
struct fdtable *fdt;
 
/* exec unshares first */
-   spin_lock(files-file_lock);
+   fdt = files_fdtable(files);
for (i = 0; ; i++) {
unsigned long set;
unsigned fd = i * BITS_PER_LONG;
-   fdt = files_fdtable(files);
if (fd = fdt-max_fds)
break;
set = fdt-close_on_exec[i];
@@ -635,14 +634,11 @@ void do_close_on_exec(struct files_struct *files)
continue;
rcu_assign_pointer(fdt-fd[fd], NULL);
__put_unused_fd(files, fd);
-   spin_unlock(files-file_lock);
filp_close(file, files);
cond_resched();
-   spin_lock(files-file_lock);
}
 
}
-   spin_unlock(files-file_lock);
 }
 
 struct file *fget(unsigned int fd)


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 07/13] tpm: Remove tpm_show_caps_1_2

2013-10-01 Thread Jason Gunthorpe
On Wed, Oct 02, 2013 at 12:09:22AM +0200, Peter H?we wrote:

Since the tpm_spi_stm_st33, tpm_i2c_nuvoton and tpm_i2c_atmel drivers
are not yet merged and were heavily improved by you anyway, please
include this improvement directly in the new drivers.

Okay, it is easy enough to invert the patch order and put the drivers
last. I am not submitting the st33 driver, but when things are done I
will send Mathias a patch with the changes from this series that are
needed and he can deal with re-submitting his driver.

FWIW, I needed the drivers first to keep my world sane when testing
this thing, so the patches are all as-tested from my perspective.

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/7] arm: dts: add dra7 IVA thermal data

2013-10-01 Thread Nishanth Menon
On 14:32-20131001, Eduardo Valentin wrote:
minor comments follow
 This patch changes a dtsi file to contain the thermal data
s/changes/introduces?
 for IVA domain on DRA7 and later SoCs. This data will
 enable a thermal shutdown at 125C.
 
 This thermal data can be reused across TI SoC devices.
is'nt it just DRA7 that reuses this - based on dtsi name?
 
 Signed-off-by: Eduardo Valentin eduardo.valen...@ti.com
 ---
  arch/arm/boot/dts/dra7-iva-thermal.dtsi | 28 
  1 file changed, 28 insertions(+)
  create mode 100644 arch/arm/boot/dts/dra7-iva-thermal.dtsi
 
 diff --git a/arch/arm/boot/dts/dra7-iva-thermal.dtsi 
 b/arch/arm/boot/dts/dra7-iva-thermal.dtsi
 new file mode 100644
 index 000..fea0cea
 --- /dev/null
 +++ b/arch/arm/boot/dts/dra7-iva-thermal.dtsi
 @@ -0,0 +1,28 @@
 +/*
 + * Device Tree Source for DRA7 SoC IVA thermal
 + *
 + * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/
 + * Contact: Eduardo Valentin eduardo.valen...@ti.com
 + *
 + * This file is licensed under the terms of the GNU General Public License
 + * version 2.  This program is licensed as is without any warranty of any
 + * kind, whether express or implied.
 + */
 +
 +#include dt-bindings/thermal/thermal.h
 +
 +iva_thermal: iva_thermal {
 + polling-delay-passive = 250; /* milliseconds */
 + polling-delay = 1000; /* milliseconds */
 +
 + /* sensor   ID */
^^ double tab here?
 + thermal-sensors = bandgap 4;
space after bandgap is good enough?
 +
 + trips {
 + iva_crit: iva_crit {
 + temperature = 125000; /* milliCelsius */
 + hysteresis = 2000; /* milliCelsius */
 + type = THERMAL_TRIP_CRITICAL;
 + };
 + };
 +};

-- 
Regards,
Nishanth Menon
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Helge Deller
On 10/01/2013 11:07 PM, Tejun Heo wrote:
 On Tue, Oct 01, 2013 at 05:03:48PM -0400, Tejun Heo wrote:
 On Tue, Oct 01, 2013 at 10:53:31PM +0200, Helge Deller wrote:
 So, in summary my patch here is not really necessary, but for the sake of
 clean code I think it doesn't hurt either and as such it would be nice if
 you could apply it.

 What? function *must* take any value and try to access it and not
 cause failure.  That's the *whole* purpose of that interface.  How is
 having incomplete spurious checks around it clean code in any sense
 of the word?  That doesn't make any sense.
 
 Just in case you didn't know already.  probe_kernel_read()'s role is
 to take any ulong value and dereference it if it can.  If not, it can
 return any value, but it shouldn't crash in any case.  If you're just
 adding NULL test in probe_kernel_read(), you're just masking a common
 failure pattern and the kernel still *will* panic while dumping the
 states.  If a specific arch doesn't have proper probe_kernel_read()
 implementation, adding if (!NULL) test there could be a temporary
 workaround, but it should be clearly marked as such.

Sure, probe_kernel_read() takes care that no segfaults will happen.
Nevertheless, if we know that pwq might become NULL, why access pwq-wq at 
all?
  struct pool_workqueue *pwq = NULL;
  probe_kernel_read(wq, pwqwq, sizeof(wq));

If you wouldn't have used probe_kernel_read() you would never code it 
like that. That's what I meant when I wrote clean coding (aka similar
to what you would have done without probe_kernel_read()).

Helge
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tpmdd-devel] [PATCH 07/13] tpm: Remove tpm_show_caps_1_2

2013-10-01 Thread Peter Hüwe
Hi Jason,

Am Mittwoch, 2. Oktober 2013, 00:21:13 schrieb Jason Gunthorpe:
 On Wed, Oct 02, 2013 at 12:09:22AM +0200, Peter H?we wrote:
 Since the tpm_spi_stm_st33, tpm_i2c_nuvoton and tpm_i2c_atmel drivers
 are not yet merged and were heavily improved by you anyway, please
 include this improvement directly in the new drivers.
 
 Okay, it is easy enough to invert the patch order and put the drivers
 last. I am not submitting the st33 driver, but when things are done I
 will send Mathias a patch with the changes from this series that are
 needed and he can deal with re-submitting his driver.
 
 FWIW, I needed the drivers first to keep my world sane when testing
 this thing, so the patches are all as-tested from my perspective.

nevermind and don't worry - the way you submitted the patches is perfectly 
fine for me.
You can keep the patches seperate and in case of doubt I can simply squash 
them in the end.

Thanks,
Peter
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] arm: dts: add dra7 DSPEVE thermal data

2013-10-01 Thread Nishanth Menon
On 14:32-20131001, Eduardo Valentin wrote:
 This patch changes a dtsi file to contain the thermal data
 ^^ introduces?
 for DSPEVE domain on DRA7 and later SoCs. This data will
 enable a thermal shutdown at 125C.
 
 This thermal data can be reused across TI SoC devices.
 
 Signed-off-by: Eduardo Valentin eduardo.valen...@ti.com
 ---
  arch/arm/boot/dts/dra7-dspeve-thermal.dtsi | 28 
  1 file changed, 28 insertions(+)
  create mode 100644 arch/arm/boot/dts/dra7-dspeve-thermal.dtsi
 
 diff --git a/arch/arm/boot/dts/dra7-dspeve-thermal.dtsi 
 b/arch/arm/boot/dts/dra7-dspeve-thermal.dtsi
 new file mode 100644
 index 000..f8b9051
 --- /dev/null
 +++ b/arch/arm/boot/dts/dra7-dspeve-thermal.dtsi
 @@ -0,0 +1,28 @@
 +/*
 + * Device Tree Source for DRA7 SoC DSPEVE thermal
 + *
 + * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/
 + * Contact: Eduardo Valentin eduardo.valen...@ti.com
 + *
 + * This file is licensed under the terms of the GNU General Public License
 + * version 2.  This program is licensed as is without any warranty of any
 + * kind, whether express or implied.
 + */
 +
 +#include dt-bindings/thermal/thermal.h
 +
 +dspeve_thermal: dspeve_thermal {
 + polling-delay-passive = 250; /* milliseconds */
 + polling-delay = 1000; /* milliseconds */
 +
 + /* sensor   ID */
^^
 + thermal-sensors = bandgap 3;
^^ tab control a bit? ;)
 +
 + trips {
 + dspeve_crit: dspeve_crit {
 + temperature = 125000; /* milliCelsius */
 + hysteresis = 2000; /* milliCelsius */
 + type = THERMAL_TRIP_CRITICAL;
 + };
 + };
 +};

-- 
Regards,
Nishanth Menon
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Tejun Heo
Hello,

On Wed, Oct 02, 2013 at 12:34:53AM +0200, Helge Deller wrote:
 Sure, probe_kernel_read() takes care that no segfaults will happen.
 Nevertheless, if we know that pwq might become NULL, why access pwq-wq at 
 all?
   struct pool_workqueue *pwq = NULL;
   probe_kernel_read(wq, pwqwq, sizeof(wq));
 
 If you wouldn't have used probe_kernel_read() you would never code it 
 like that. That's what I meant when I wrote clean coding (aka similar
 to what you would have done without probe_kernel_read()).

Because it is using probe_kernel_read() and such test wouldn't mean
anything?  It may be NULL, it may be 1 or full Fs.  NULL is just one
of many illegal pointers which may happen.  Why add code which doesn't
achieve anything when you're explicitly trying to access pointers
which you know could be invalid?  Why is that clean?  Is if (p)
kfree(p) cleaner than kfree(p)?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: RFC: (re-)binding the VFIO platform driver to a platform device

2013-10-01 Thread Scott Wood
On Tue, 2013-10-01 at 16:59 -0500, Kim Phillips wrote:
 On Tue, 1 Oct 2013 14:15:38 -0500
 Scott Wood scottw...@freescale.com wrote:
 
  I think the ideal interface would be if you could write the sysfs device
  name into the vfio bind file (or some new file in the same directory),
  and have it claim that device (preferably with an atomic unbind from the
  previous driver).
 
 ok.

...which apparently is what you are already doing (except for the atomic
part).  My recollection of how this works on PCI (via new_id) apparently
kept me from reading it properly. :-P

  We shouldn't be messing around with compatible
  (either modifying it or telling VFIO which compatibles to look for) when
  we know the specific devices (not just type of devices) we want to bind.
 
 ok, but I still don't see how to get past driver_match_device()'s
 refusal to allow bind a non-compatible driver (or one who's name isn't
 in the compatible list).

Probably something similar to your hack, except use a flag or some other
neutral mechanism rather than a driver name.

The flag could be something like I'll try to bind to any device on this
bus, but only if explicitly requested.

-Scott



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] OOM killer: wait for tasks with pending SIGKILL to exit

2013-10-01 Thread David Rientjes
On Tue, 1 Oct 2013, Sergey Dyasly wrote:

 If you are ok with the first change in my patch regarding 
 fatal_signal_pending,
 I can send new patch with just that change.
 

The entire patch is pointless, there's no need to give access to memory 
reserves simply because it is PF_EXITING.  If it needs memory, it will 
call the oom killer itself.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [BUG] Regression in 2fdac010 drivers/net/ethernet/via/via-velocity.c: update napi implementation

2013-10-01 Thread Francois Romieu
Julia Lawall julia.law...@lip6.fr :
[...]
 There has already been a discussion about this, and a patch has already 
 been proposed.  It has to do with lock managament.  I will look for the 
 email.

The underlying problem has to do with disabled irq. netif_receive_skb
assumes irq to be enabled. Current via-velocity poll() method should
narrow its (spinlocked) irq disabled section.

What I've done should not require much analysis (aka what could race
with the rx bh processing in a napi driver ?) and avoids a more intrusive
lockless napi design.

-- 
Ueimor
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread Tejun Heo
On Tue, Oct 01, 2013 at 06:40:23PM -0400, Tejun Heo wrote:
 Because it is using probe_kernel_read() and such test wouldn't mean
 anything?  It may be NULL, it may be 1 or full Fs.  NULL is just one
 of many illegal pointers which may happen.  Why add code which doesn't
 achieve anything when you're explicitly trying to access pointers
 which you know could be invalid?  Why is that clean?  Is if (p)
 kfree(p) cleaner than kfree(p)?

Here's one general rule of thumb for cleanliness - try to do the
minimal because that's something many people can agree on.  If people
do stuff which aren't necessary, naturally different people would have
different opinions on what's cleaner / better and inevitably end up
with different choices as the choices made are functionally superflous
none would fail and we'll end up with various variants for the same
thing for no good reason, which is messy.  Adding if (p) in front of
probe_kernel_read(p) is inherently superflous and you wouldn't have
any way to enforce or even encourage such practice and the end result
would inevitably be if (p) being sprayed randomly, which is the
opposite of cleanliness.

So, no, please don't add random tests which aren't essential.  It is
inherently messy thing to do.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/7] arm: dts: dra7: add bandgap entry

2013-10-01 Thread Nishanth Menon
On 14:32-20131001, Eduardo Valentin wrote:
 This patch adds bandgap IP entry on DRA7 dtsi device tree file.
 
 Signed-off-by: Eduardo Valentin eduardo.valen...@ti.com
 ---
  arch/arm/boot/dts/dra7.dtsi | 12 
  1 file changed, 12 insertions(+)
 
 diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
 index 3a746c2..4e9b159 100644
 --- a/arch/arm/boot/dts/dra7.dtsi
 +++ b/arch/arm/boot/dts/dra7.dtsi
 @@ -53,6 +53,18 @@
   };
   };
  
 + bandgap: bandgap {
would you like to follow bandgap: bandgap@4a0021e0 convention?
Also, could you move it under ocp?
I already commented about this previously here: 
https://lkml.org/lkml/2013/9/27/300

 + reg = 0x4a0021e0 0xc
 + 0x4a00232c 0xc
 + 0x4a002380 0x2c
 + 0x4a0023C0 0x3c
 + 0x4a002564 0x8
 + 0x4a002574 0x50;
 + compatible = ti,dra752-bandgap;
 + interrupts = GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH;
 + #thermal-sensor-cells = 1;
 + };
 +
   gic: interrupt-controller@48211000 {
   compatible = arm,cortex-a15-gic;
   interrupt-controller;
-- 
Regards,
Nishanth Menon
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 08/13] tpm: Pull everything related to /dev/tpmX into tpm-dev.c

2013-10-01 Thread Peter Hüwe
Am Montag, 23. September 2013, 20:14:38 schrieb Jason Gunthorpe:
 CLASS-dev.c is a common idiom for Linux subsystems
 
 This pulls all the code related to the miscdev into tpm-dev.c and makes it
 static. The identical file_operation structs in the drivers are purged and
 the tpm common code unconditionally creates the miscdev.
 
 Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
 ---
  drivers/char/tpm/Makefile   |   2 +-
  drivers/char/tpm/tpm-dev.c  | 199

When compiling the tpm drivers as modules I get
ERROR: tpm_sysfs_del_device [drivers/char/tpm/tpm.ko] undefined!
ERROR: tpm_dev_add_device [drivers/char/tpm/tpm.ko] undefined!
ERROR: tpm_dev_del_device [drivers/char/tpm/tpm.ko] undefined!
ERROR: tpm_sysfs_add_device [drivers/char/tpm/tpm.ko] undefined!
ERROR: tpm_transmit [drivers/char/tpm/tpm-sysfs.ko] undefined!
ERROR: tpm_pcr_read_dev [drivers/char/tpm/tpm-sysfs.ko] undefined!
ERROR: tpm_getcap [drivers/char/tpm/tpm-sysfs.ko] undefined!
ERROR: tpm_transmit [drivers/char/tpm/tpm-dev.ko] undefined!


I added a suitable patch with the appropriate EXPORT_SYMBOL_GPL declarations 
to my testing branch. (also see next message)


Thanks,
Peter
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [workqueue] check values of pwq and wq in print_worker_info() before use

2013-10-01 Thread James Bottomley
On Wed, 2013-10-02 at 00:07 +0200, Helge Deller wrote:
 On 10/01/2013 11:40 PM, James Bottomley wrote:
  On Tue, 2013-10-01 at 16:43 -0400, Tejun Heo wrote:
  Hello,
 
  On Tue, Oct 01, 2013 at 10:35:20PM +0200, Helge Deller wrote:
  print_worker_info() includes no validity check on the pwq and wq
  pointers before handing them over to the probe_kernel_read() functions.
 
  It seems that most architectures don't care about that, but at least on
  the parisc architecture this leads to a kernel crash since accesses to
  page zero are protected by the kernel for security reasons.
 
  Fix this problem by verifying the contents of pwq and wq before usage.
  Even if probe_kernel_read() usually prevents such crashes by disabling
  page faults, clean code should always include such checks. 
 
  Without this fix issuing echo t  /proc/sysrq-trigger will immediately
  crash the Linux kernel on the parisc architecture.
 
  Hmm... um had similar problem but the root cause here is that the arch
  isn't implementing probe_kernel_read() properly.  We really have no
  idea what the pointer value may be at the dump point and that's why we
  use probe_kernel_read().  If something like the above is necessary for
  the time being, the correct place would be the arch
  probe_kernel_read() implementation.  James, would it be difficult
  implement proper probe_kernel_read() on parisc?
  
  The problem seems to be that some traps bypass our exception table
  handling.  
 
 Yes, that's correct.
 It's trap #26 and we directly call parisc_terminate() for fault_space==0
 without checking the exception table.
 See my patch I posted a few hours ago which fixes this:
 https://patchwork.kernel.org/patch/2971701/

That doesn't quite look right ... I guessed it was probably access
rights, so we should do an exception table fixup, so isn't this the fix?
because we shouldn't call do_page_fault if there's no exception table.

James

---
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 04e47c6..25a088a 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -684,6 +684,8 @@ void notrace handle_interruption(int code, struct pt_regs 
*regs)
/* Fall Through */
case 26: 
/* PCXL: Data memory access rights trap */
+   if (!user_mode(regs)  fixup_exception(regs))
+   return;
fault_address = regs-ior;
fault_space   = regs-isr;
break;


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/7] arm: dts: add tmp102 i2c sensor node on dra7-evm

2013-10-01 Thread Nishanth Menon
On 14:32-20131001, Eduardo Valentin wrote:
 On dra7-evm there is an tmp102 temperature sensor on i2c bus 1.
 This patch adds its device tree node.
 
 Signed-off-by: Eduardo Valentin eduardo.valen...@ti.com
 ---
  arch/arm/boot/dts/dra7-evm.dts | 8 
  1 file changed, 8 insertions(+)
 
 diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts
 index 21fe16b..3b6c16a 100644
 --- a/arch/arm/boot/dts/dra7-evm.dts
 +++ b/arch/arm/boot/dts/dra7-evm.dts
 @@ -7,6 +7,8 @@
   */
  /dts-v1/;
  
 +#include dt-bindings/thermal/thermal.h
 +

^^ needed?

  #include dra7.dtsi
  
  / {
 @@ -93,6 +95,12 @@
   pinctrl-names = default;
   pinctrl-0 = i2c1_pins;
   clock-frequency = 40;
could you add an EOL here?
 + tmp102: tmp102@48{
a space before the {?
 + compatible = ti,tmp102;
 + reg = 0x48;
 +
 + #thermal-sensor-cells = 0;
 + };
  };
  
  i2c2 {
 -- 
 1.8.2.1.342.gfa7285d
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-pm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Regards,
Nishanth Menon
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 08/13] tpm: Pull everything related to /dev/tpmX into tpm-dev.c

2013-10-01 Thread Jason Gunthorpe
On Wed, Oct 02, 2013 at 12:52:40AM +0200, Peter H?we wrote:
 Am Montag, 23. September 2013, 20:14:38 schrieb Jason Gunthorpe:
  CLASS-dev.c is a common idiom for Linux subsystems
  
  This pulls all the code related to the miscdev into tpm-dev.c and makes it
  static. The identical file_operation structs in the drivers are purged and
  the tpm common code unconditionally creates the miscdev.
  
  Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
   drivers/char/tpm/Makefile   |   2 +-
   drivers/char/tpm/tpm-dev.c  | 199
 
 When compiling the tpm drivers as modules I get
 ERROR: tpm_sysfs_del_device [drivers/char/tpm/tpm.ko] undefined!
 ERROR: tpm_dev_add_device [drivers/char/tpm/tpm.ko] undefined!
 ERROR: tpm_dev_del_device [drivers/char/tpm/tpm.ko] undefined!
 ERROR: tpm_sysfs_add_device [drivers/char/tpm/tpm.ko] undefined!
 ERROR: tpm_transmit [drivers/char/tpm/tpm-sysfs.ko] undefined!
 ERROR: tpm_pcr_read_dev [drivers/char/tpm/tpm-sysfs.ko] undefined!
 ERROR: tpm_getcap [drivers/char/tpm/tpm-sysfs.ko] undefined!
 ERROR: tpm_transmit [drivers/char/tpm/tpm-dev.ko] undefined!

Oh, I am glad you can test modules..

I botched the makefile changes for the new .c files.

I believe it should be like this:

obj-$(CONFIG_TCG_TPM) += tpm-core.o
tpm-core-y := tpm.o tpm-dev.o tpm-sysfs.o

 I added a suitable patch with the appropriate EXPORT_SYMBOL_GPL declarations 
 to my testing branch. (also see next message)

EXPORT_SYMBOL_GPL is not correct, these are in-module references, not
cross module references, and I've deliberately not exported them to
prevent drivers from trying to use them inappropriately.

Let me know if you want me to respin things.

Regards,
Jason
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


<    1   2   3   4   5   6   7   8   9   10   >