[PATCH V3 3/3] vdpa_sim: flush workers on suspend

2024-05-20 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns.
Add a lock to enforce ordering between clearing running, flushing,
and posting new work in vdpasim_kick_vq.  It must be a spin lock
because vdpasim_kick_vq may be reached via eventfd_write.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 16 ++--
 drivers/vdpa/vdpa_sim/vdpa_sim.h |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 8ffea8430f95..67ed49d95bf0 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -322,7 +322,7 @@ static u16 vdpasim_get_vq_size(struct vdpa_device *vdpa, 
u16 idx)
return VDPASIM_QUEUE_MAX;
 }
 
-static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+static void vdpasim_do_kick_vq(struct vdpa_device *vdpa, u16 idx)
 {
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
@@ -337,6 +337,15 @@ static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 
idx)
vdpasim_schedule_work(vdpasim);
 }
 
+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+   spin_lock(&vdpasim->kick_lock);
+   vdpasim_do_kick_vq(vdpa, idx);
+   spin_unlock(&vdpasim->kick_lock);
+}
+
 static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
  struct vdpa_callback *cb)
 {
@@ -520,8 +529,11 @@ static int vdpasim_suspend(struct vdpa_device *vdpa)
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 
	mutex_lock(&vdpasim->mutex);
+   spin_lock(&vdpasim->kick_lock);
	vdpasim->running = false;
+   spin_unlock(&vdpasim->kick_lock);
	mutex_unlock(&vdpasim->mutex);
+   kthread_flush_work(&vdpasim->work);
 
return 0;
 }
@@ -537,7 +549,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa)
if (vdpasim->pending_kick) {
/* Process pending descriptors */
for (i = 0; i < vdpasim->dev_attr.nvqs; ++i)
-   vdpasim_kick_vq(vdpa, i);
+   vdpasim_do_kick_vq(vdpa, i);
 
vdpasim->pending_kick = false;
}
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index bb137e479763..5eb6ca9c5ec5 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -75,6 +75,7 @@ struct vdpasim {
bool pending_kick;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
+   spinlock_t kick_lock;
 };
 
 struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr,
-- 
2.39.3
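
For reference, the ordering argument can be condensed as below.  This is a
simplified sketch, not the driver source: the real kick path also handles
per-vq state and pending kicks, which are omitted here.

/* Simplified sketch of the ordering enforced by kick_lock (not the
 * driver source; per-vq checks and pending-kick handling are omitted).
 */
static void kick_vq_sketch(struct vdpasim *vdpasim, u16 idx)
{
	spin_lock(&vdpasim->kick_lock);
	if (vdpasim->running)			/* tested under kick_lock */
		vdpasim_schedule_work(vdpasim);	/* post to the kthread worker */
	spin_unlock(&vdpasim->kick_lock);
}

static int suspend_sketch(struct vdpasim *vdpasim)
{
	spin_lock(&vdpasim->kick_lock);
	vdpasim->running = false;		/* later kicks post nothing */
	spin_unlock(&vdpasim->kick_lock);
	kthread_flush_work(&vdpasim->work);	/* wait out already-posted work */
	return 0;
}

A kick that runs before suspend clears running posts its work, and that work
is drained by the flush; a kick that runs afterwards sees running == false
and posts nothing, so no worker is active once suspend returns.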




[PATCH V3 2/3] vduse: suspend

2024-05-20 Thread Steve Sistare
Support the suspend operation.  There is little to do, except flush to
guarantee no workers are running when suspend returns.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 73c89701fc9d..7dc46f771f12 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -472,6 +472,18 @@ static void vduse_dev_reset(struct vduse_dev *dev)
	up_write(&dev->rwsem);
 }
 
+static void vduse_flush_work(struct vduse_dev *dev)
+{
+   flush_work(&dev->inject);
+
+   for (int i = 0; i < dev->vq_num; i++) {
+   struct vduse_virtqueue *vq = dev->vqs[i];
+
+   flush_work(&vq->inject);
+   flush_work(&vq->kick);
+   }
+}
+
 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
u64 desc_area, u64 driver_area,
u64 device_area)
@@ -724,6 +736,17 @@ static int vduse_vdpa_reset(struct vdpa_device *vdpa)
return ret;
 }
 
+static int vduse_vdpa_suspend(struct vdpa_device *vdpa)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   down_write(&dev->rwsem);
+   vduse_flush_work(dev);
+   up_write(&dev->rwsem);
+
+   return 0;
+}
+
 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
 {
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -806,6 +829,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.set_vq_affinity= vduse_vdpa_set_vq_affinity,
.get_vq_affinity= vduse_vdpa_get_vq_affinity,
.reset  = vduse_vdpa_reset,
+   .suspend= vduse_vdpa_suspend,
.set_map= vduse_vdpa_set_map,
.free   = vduse_vdpa_free,
 };
-- 
2.39.3




[PATCH V3 0/3] flush workers on suspend

2024-05-20 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns,
for vdpa, vdpa_sim, and vduse.  (mlx5 already does so, via the path
mlx5_vdpa_suspend -> unregister_link_notifier -> flush_workqueue.)

Changes in V2:
  - renamed "vduse: suspend" (was vduse: flush workers on suspend)
  - call vhost_dev_flush unconditionally in "vhost-vdpa: flush workers on suspend"

Changes in v3:
  - rewrote vdpa_sim patch

Steve Sistare (3):
  vhost-vdpa: flush workers on suspend
  vduse: suspend
  vdpa_sim: flush workers on suspend

 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 16 ++--
 drivers/vdpa/vdpa_sim/vdpa_sim.h   |  1 +
 drivers/vdpa/vdpa_user/vduse_dev.c | 24 
 drivers/vhost/vdpa.c   |  3 +++
 4 files changed, 42 insertions(+), 2 deletions(-)

-- 
2.39.3




[PATCH V3 1/3] vhost-vdpa: flush workers on suspend

2024-05-20 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns.

Fixes: f345a0143b4d ("vhost-vdpa: uAPI to suspend the device")
Signed-off-by: Steve Sistare 
Acked-by: Eugenio Pérez 
---
 drivers/vhost/vdpa.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ba52d128aeb7..189596caaec9 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -594,6 +594,7 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int ret;
+   struct vhost_dev *vdev = &v->vdev;
 
if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
return 0;
@@ -601,6 +602,8 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
if (!ops->suspend)
return -EOPNOTSUPP;
 
+   vhost_dev_flush(vdev);
+
ret = ops->suspend(vdpa);
if (!ret)
v->suspended = true;
-- 
2.39.3




[PATCH V2] vdpa: skip suspend/resume ops if not DRIVER_OK

2024-02-13 Thread Steve Sistare
If a vdpa device is not in state DRIVER_OK, then there is no driver state
to preserve, so no need to call the suspend and resume driver ops.

Suggested-by: Eugenio Perez Martin 
Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..aef92a7c57f3 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -595,6 +595,9 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
const struct vdpa_config_ops *ops = vdpa->config;
int ret;
 
+   if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return 0;
+
if (!ops->suspend)
return -EOPNOTSUPP;
 
@@ -615,6 +618,9 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v)
const struct vdpa_config_ops *ops = vdpa->config;
int ret;
 
+   if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return 0;
+
if (!ops->resume)
return -EOPNOTSUPP;
 
-- 
2.39.3




[PATCH V2 2/3] vduse: suspend

2024-02-12 Thread Steve Sistare
Support the suspend operation.  There is little to do, except flush to
guarantee no workers are running when suspend returns.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 1d24da79c399..503030f19e52 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -472,6 +472,18 @@ static void vduse_dev_reset(struct vduse_dev *dev)
	up_write(&dev->rwsem);
 }
 
+static void vduse_flush_work(struct vduse_dev *dev)
+{
+   flush_work(&dev->inject);
+
+   for (int i = 0; i < dev->vq_num; i++) {
+   struct vduse_virtqueue *vq = dev->vqs[i];
+
+   flush_work(&vq->inject);
+   flush_work(&vq->kick);
+   }
+}
+
 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
u64 desc_area, u64 driver_area,
u64 device_area)
@@ -713,6 +725,17 @@ static int vduse_vdpa_reset(struct vdpa_device *vdpa)
return ret;
 }
 
+static int vduse_vdpa_suspend(struct vdpa_device *vdpa)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   down_write(&dev->rwsem);
+   vduse_flush_work(dev);
+   up_write(&dev->rwsem);
+
+   return 0;
+}
+
 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
 {
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -794,6 +817,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.set_vq_affinity= vduse_vdpa_set_vq_affinity,
.get_vq_affinity= vduse_vdpa_get_vq_affinity,
.reset  = vduse_vdpa_reset,
+   .suspend= vduse_vdpa_suspend,
.set_map= vduse_vdpa_set_map,
.free   = vduse_vdpa_free,
 };
-- 
2.39.3




[PATCH V2 1/3] vhost-vdpa: flush workers on suspend

2024-02-12 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..a3b986c24805 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -594,10 +594,13 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int ret;
+   struct vhost_dev *vdev = &v->vdev;
 
if (!ops->suspend)
return -EOPNOTSUPP;
 
+   vhost_dev_flush(vdev);
+
ret = ops->suspend(vdpa);
if (!ret)
v->suspended = true;
-- 
2.39.3




[PATCH V2 0/3] flush workers on suspend

2024-02-12 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns,
for vdpa, vdpa_sim, and vduse.  (mlx5 already does so, via the path
mlx5_vdpa_suspend -> unregister_link_notifier -> flush_workqueue.)

Steve Sistare (3):
  vhost-vdpa: flush workers on suspend
  vduse: suspend
  vdpa_sim: flush workers on suspend

 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 13 +
 drivers/vdpa/vdpa_user/vduse_dev.c | 24 
 drivers/vhost/vdpa.c   |  3 +++
 3 files changed, 40 insertions(+)

-- 
2.39.3




[PATCH V2 3/3] vdpa_sim: flush workers on suspend

2024-02-12 Thread Steve Sistare
Flush to guarantee no workers are running when suspend returns.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..a662b90357c3 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -74,6 +74,17 @@ static void vdpasim_worker_change_mm_sync(struct vdpasim 
*vdpasim,
kthread_flush_work(work);
 }
 
+static void flush_work_fn(struct kthread_work *work) {}
+
+static void vdpasim_flush_work(struct vdpasim *vdpasim)
+{
+   struct kthread_work work;
+
+   kthread_init_work(&work, flush_work_fn);
+   kthread_queue_work(vdpasim->worker, &work);
+   kthread_flush_work(&work);
+}
+
 static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
 {
return container_of(vdpa, struct vdpasim, vdpa);
@@ -511,6 +522,8 @@ static int vdpasim_suspend(struct vdpa_device *vdpa)
vdpasim->running = false;
	mutex_unlock(&vdpasim->mutex);
 
+   vdpasim_flush_work(vdpasim);
+
return 0;
 }
 
-- 
2.39.3




[PATCH V1] vdpa: suspend and resume require DRIVER_OK

2024-02-09 Thread Steve Sistare
Calling suspend or resume requires VIRTIO_CONFIG_S_DRIVER_OK, for all
vdpa devices.

Suggested-by: Eugenio Perez Martin 
Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..ce1882acfc3b 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -598,6 +598,9 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
if (!ops->suspend)
return -EOPNOTSUPP;
 
+   if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return -EINVAL;
+
ret = ops->suspend(vdpa);
if (!ret)
v->suspended = true;
@@ -618,6 +621,9 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v)
if (!ops->resume)
return -EOPNOTSUPP;
 
+   if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return -EINVAL;
+
ret = ops->resume(vdpa);
if (!ret)
v->suspended = false;
-- 
2.39.3




[PATCH V2] vdpa_sim: reset must not run

2024-02-09 Thread Steve Sistare
vdpasim_do_reset sets running to true, which is wrong, as it allows
vdpasim_kick_vq to post work requests before the device has been
configured.  To fix, do not set running until VIRTIO_CONFIG_S_DRIVER_OK
is set.

Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op")
Signed-off-by: Steve Sistare 
Reviewed-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..18584ce70bf0 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 
flags)
}
}
 
-   vdpasim->running = true;
+   vdpasim->running = false;
	spin_unlock(&vdpasim->iommu_lock);
 
vdpasim->features = 0;
@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 
status)
 
	mutex_lock(&vdpasim->mutex);
	vdpasim->status = status;
+   vdpasim->running = (status & VIRTIO_CONFIG_S_DRIVER_OK) != 0;
	mutex_unlock(&vdpasim->mutex);
 }
 
-- 
2.39.3




[PATCH V1] vdpa_sim: reset must not run

2024-01-17 Thread Steve Sistare
vdpasim_do_reset sets running to true, which is wrong, as it allows
vdpasim_kick_vq to post work requests before the device has been
configured.  To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK
is set.

Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op")
Signed-off-by: Steve Sistare 
Reviewed-by: Eugenio Pérez 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..6304cb0b4770 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 
flags)
}
}
 
-   vdpasim->running = true;
+   vdpasim->running = false;
	spin_unlock(&vdpasim->iommu_lock);
 
vdpasim->features = 0;
@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 
status)
 
	mutex_lock(&vdpasim->mutex);
	vdpasim->status = status;
+   vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;
	mutex_unlock(&vdpasim->mutex);
 }
 
-- 
2.39.3




[RFC V1 12/13] vdpa_sim: new owner capability

2024-01-10 Thread Steve Sistare
The vdpa_sim device supports ownership transfer to a new process, so
advertise VHOST_BACKEND_F_NEW_OWNER.  User virtual addresses are used
by the software iommu, so VHOST_IOTLB_REMAP is required after
VHOST_NEW_OWNER, so advertise VHOST_BACKEND_F_IOTLB_REMAP.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 8734834983cb..d037869d8a89 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -430,7 +430,13 @@ static u64 vdpasim_get_device_features(struct vdpa_device 
*vdpa)
 
 static u64 vdpasim_get_backend_features(const struct vdpa_device *vdpa)
 {
-   return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
+   u64 features = BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) |
+  BIT_ULL(VHOST_BACKEND_F_NEW_OWNER);
+
+   if (use_va)
+   features += BIT_ULL(VHOST_BACKEND_F_IOTLB_REMAP);
+
+   return features;
 }
 
 static int vdpasim_set_driver_features(struct vdpa_device *vdpa, u64 features)
-- 
2.39.3




[RFC V1 13/13] vduse: new owner capability

2024-01-10 Thread Steve Sistare
The vduse device supports ownership transfer to a new process, so
advertise VHOST_BACKEND_F_NEW_OWNER.  User virtual addresses are used
by the software iommu, so VHOST_IOTLB_REMAP is required after
VHOST_NEW_OWNER, so advertise VHOST_BACKEND_F_IOTLB_REMAP.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 6b25457a037d..67815f6391db 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -608,6 +609,12 @@ static u32 vduse_vdpa_get_vq_align(struct vdpa_device 
*vdpa)
return dev->vq_align;
 }
 
+static u64 vduse_vdpa_get_backend_features(const struct vdpa_device *vdpa)
+{
+   return BIT_ULL(VHOST_BACKEND_F_IOTLB_REMAP) |
+  BIT_ULL(VHOST_BACKEND_F_NEW_OWNER);
+}
+
 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
 {
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -801,6 +808,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.set_vq_state   = vduse_vdpa_set_vq_state,
.get_vq_state   = vduse_vdpa_get_vq_state,
.get_vq_align   = vduse_vdpa_get_vq_align,
+   .get_backend_features   = vduse_vdpa_get_backend_features,
.get_device_features= vduse_vdpa_get_device_features,
.set_driver_features= vduse_vdpa_set_driver_features,
.get_driver_features= vduse_vdpa_get_driver_features,
-- 
2.39.3




[RFC V1 07/13] vhost-vdpa: flush workers on suspend

2024-01-10 Thread Steve Sistare
To pass ownership of a live vdpa device to a new process, the user
suspends the device, calls VHOST_NEW_OWNER to change the mm, and calls
VHOST_IOTLB_REMAP to change the user virtual addresses to match the new
mm.  Flush workers in suspend to guarantee that no worker sees the new
mm and old VA in between.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 8fe1562d24af..9673e8e20d11 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -591,10 +591,14 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
 {
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
+   struct vhost_dev *vdev = &v->vdev;
 
if (!ops->suspend)
return -EOPNOTSUPP;
 
+   if (vdev->use_worker)
+   vhost_dev_flush(vdev);
+
return ops->suspend(vdpa);
 }
 
-- 
2.39.3




[RFC V1 04/13] vhost-vdpa: VHOST_BACKEND_F_NEW_OWNER

2024-01-10 Thread Steve Sistare
Add the VHOST_BACKEND_F_NEW_OWNER backend capability, which indicates that
VHOST_NEW_OWNER is supported.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 7 ++-
 include/uapi/linux/vhost_types.h | 2 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index eb3a95e703b0..faed6471934a 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -621,6 +621,10 @@ static long vhost_vdpa_new_owner(struct vhost_vdpa *v)
struct mm_struct *mm_new = current->mm;
long pinned_vm = v->pinned_vm;
unsigned long lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
+   u64 features = vhost_vdpa_get_backend_features(v);
+
+   if (!(features & BIT_ULL(VHOST_BACKEND_F_NEW_OWNER)))
+   return -EOPNOTSUPP;
 
if (!mm_old)
return -EINVAL;
@@ -784,7 +788,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST) |
 BIT_ULL(VHOST_BACKEND_F_SUSPEND) |
 BIT_ULL(VHOST_BACKEND_F_RESUME) |
-BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK)))
+BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) |
+BIT_ULL(VHOST_BACKEND_F_NEW_OWNER)))
return -EOPNOTSUPP;
if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) &&
 !vhost_vdpa_can_suspend(v))
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index d7656908f730..9177843951e9 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -192,5 +192,7 @@ struct vhost_vdpa_iova_range {
 #define VHOST_BACKEND_F_DESC_ASID  0x7
 /* IOTLB don't flush memory mapping across device reset */
 #define VHOST_BACKEND_F_IOTLB_PERSIST  0x8
+/* Supports VHOST_NEW_OWNER */
+#define VHOST_BACKEND_F_NEW_OWNER  0x9
 
 #endif
-- 
2.39.3




[RFC V1 11/13] vdpa/mlx5: new owner capability

2024-01-10 Thread Steve Sistare
The mlx5 vdpa device supports ownership transfer to a new process, so
advertise VHOST_BACKEND_F_NEW_OWNER.  User virtual addresses are not
used after they are initially translated to physical, so VHOST_IOTLB_REMAP
is not required, hence VHOST_BACKEND_F_IOTLB_REMAP is not advertised.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 26ba7da6b410..26f24fb0e160 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2562,7 +2562,8 @@ static void unregister_link_notifier(struct mlx5_vdpa_net 
*ndev)
 
 static u64 mlx5_vdpa_get_backend_features(const struct vdpa_device *vdpa)
 {
-   return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
+   return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) |
+  BIT_ULL(VHOST_BACKEND_F_NEW_OWNER);
 }
 
 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 
features)
-- 
2.39.3




[RFC V1 10/13] vdpa_sim: flush workers on suspend

2024-01-10 Thread Steve Sistare
To pass ownership of a live vdpa device to a new process, the user
suspends the device, calls VHOST_NEW_OWNER to change the mm, and calls
VHOST_IOTLB_REMAP to change the user virtual addresses to match the new
mm.  Flush workers in suspend to guarantee that no worker sees the new
mm and old VA in between.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6304cb0b4770..8734834983cb 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -74,6 +74,17 @@ static void vdpasim_worker_change_mm_sync(struct vdpasim 
*vdpasim,
kthread_flush_work(work);
 }
 
+static void flush_work_fn(struct kthread_work *work) {}
+
+static void vdpasim_flush_work(struct vdpasim *vdpasim)
+{
+   struct kthread_work work;
+
+   kthread_init_work(&work, flush_work_fn);
+   kthread_queue_work(vdpasim->worker, &work);
+   kthread_flush_work(&work);
+}
+
 static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
 {
return container_of(vdpa, struct vdpasim, vdpa);
@@ -512,6 +523,8 @@ static int vdpasim_suspend(struct vdpa_device *vdpa)
vdpasim->running = false;
	mutex_unlock(&vdpasim->mutex);
 
+   vdpasim_flush_work(vdpasim);
+
return 0;
 }
 
-- 
2.39.3




[RFC V1 09/13] vdpa_sim: reset must not run

2024-01-10 Thread Steve Sistare
vdpasim_do_reset sets running to true, which is wrong, as it allows
vdpasim_kick_vq to post work requests before the device has been
configured.  To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK
is set.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..6304cb0b4770 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 
flags)
}
}
 
-   vdpasim->running = true;
+   vdpasim->running = false;
	spin_unlock(&vdpasim->iommu_lock);
 
vdpasim->features = 0;
@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 
status)
 
	mutex_lock(&vdpasim->mutex);
	vdpasim->status = status;
+   vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;
	mutex_unlock(&vdpasim->mutex);
 }
 
-- 
2.39.3




[RFC V1 05/13] vhost-vdpa: VHOST_IOTLB_REMAP

2024-01-10 Thread Steve Sistare
When device ownership is passed to a new process via VHOST_NEW_OWNER,
some devices need to know the new userland addresses of the dma mappings.
Define the new iotlb message type VHOST_IOTLB_REMAP to update the uaddr
of a mapping.  The new uaddr must address the same memory object as
originally mapped.

The user must suspend the device before the old address is invalidated,
and cannot resume it until after VHOST_IOTLB_REMAP is called, but this
requirement is not enforced by the API.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 34 
 include/uapi/linux/vhost_types.h | 11 ++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index faed6471934a..ec5ca20bd47d 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -1219,6 +1219,37 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
 
 }
 
+static int vhost_vdpa_process_iotlb_remap(struct vhost_vdpa *v,
+ struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_msg *msg)
+{
+   struct vdpa_device *vdpa = v->vdpa;
+   const struct vdpa_config_ops *ops = vdpa->config;
+   u32 asid = iotlb_to_asid(iotlb);
+   u64 start = msg->iova;
+   u64 last = start + msg->size - 1;
+   struct vhost_iotlb_map *map;
+   int r = 0;
+
+   if (msg->perm || !msg->size)
+   return -EINVAL;
+
+   map = vhost_iotlb_itree_first(iotlb, start, last);
+   if (!map)
+   return -ENOENT;
+
+   if (map->start != start || map->last != last)
+   return -EINVAL;
+
+   /* batch will finish with remap.  non-batch must do it now. */
+   if (!v->in_batch)
+   r = ops->set_map(vdpa, asid, iotlb);
+   if (!r)
+   map->addr = msg->uaddr;
+
+   return r;
+}
+
 static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   struct vhost_iotlb *iotlb,
   struct vhost_iotlb_msg *msg)
@@ -1298,6 +1329,9 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev 
*dev, u32 asid,
ops->set_map(vdpa, asid, iotlb);
v->in_batch = false;
break;
+   case VHOST_IOTLB_REMAP:
+   r = vhost_vdpa_process_iotlb_remap(v, iotlb, msg);
+   break;
default:
r = -EINVAL;
break;
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 9177843951e9..35908315ff55 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -79,7 +79,7 @@ struct vhost_iotlb_msg {
 /*
  * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying
  * multiple mappings in one go: beginning with
- * VHOST_IOTLB_BATCH_BEGIN, followed by any number of
+ * VHOST_IOTLB_BATCH_BEGIN, followed by any number of VHOST_IOTLB_REMAP or
  * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END.
  * When one of these two values is used as the message type, the rest
  * of the fields in the message are ignored. There's no guarantee that
@@ -87,6 +87,15 @@ struct vhost_iotlb_msg {
  */
 #define VHOST_IOTLB_BATCH_BEGIN  5
 #define VHOST_IOTLB_BATCH_END  6
+
+/*
+ * VHOST_IOTLB_REMAP registers a new uaddr for the existing mapping at iova.
+ * The new uaddr must address the same memory object as originally mapped.
+ * Failure to do so will result in user memory corruption and/or device
+ * misbehavior.  iova and size must match the arguments used to create the
+ * existing mapping.  Protection is not changed, and perm must be 0.
+ */
+#define VHOST_IOTLB_REMAP  7
__u8 type;
 };
 
-- 
2.39.3
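
To show the userspace side, a rough sketch of issuing one REMAP message
follows, assuming the v2 iotlb message format (struct vhost_msg_v2); the
helper name and error handling are illustrative, not part of this series.

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/vhost.h>
#include <linux/vhost_types.h>

/* Illustrative helper (not part of the series): update the uaddr of one
 * existing mapping after ownership moved to the current process.  iova and
 * size must exactly match the original VHOST_IOTLB_UPDATE; perm must be 0.
 */
static int iotlb_remap_one(int fd, uint64_t iova, uint64_t size,
			   uint64_t new_uaddr)
{
	struct vhost_msg_v2 msg;

	memset(&msg, 0, sizeof(msg));
	msg.type = VHOST_IOTLB_MSG_V2;
	msg.iotlb.type = VHOST_IOTLB_REMAP;	/* added by this patch */
	msg.iotlb.iova = iova;
	msg.iotlb.size = size;
	msg.iotlb.uaddr = new_uaddr;
	msg.iotlb.perm = 0;			/* protection is unchanged */

	return write(fd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg) ? 0 : -1;
}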




[RFC V1 06/13] vhost-vdpa: VHOST_BACKEND_F_IOTLB_REMAP

2024-01-10 Thread Steve Sistare
Add the VHOST_BACKEND_F_IOTLB_REMAP backend capability, which indicates
that VHOST_IOTLB_REMAP is supported.

If VHOST_BACKEND_F_IOTLB_REMAP is advertised, then the user must call
VHOST_IOTLB_REMAP after ownership of a device is transferred to a new
process via VHOST_NEW_OWNER.  Disabling the feature during negotiation
does not negate this requirement.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 8 +++-
 include/uapi/linux/vhost_types.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ec5ca20bd47d..8fe1562d24af 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -789,7 +789,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 BIT_ULL(VHOST_BACKEND_F_SUSPEND) |
 BIT_ULL(VHOST_BACKEND_F_RESUME) |
 
BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) |
-BIT_ULL(VHOST_BACKEND_F_NEW_OWNER)))
+BIT_ULL(VHOST_BACKEND_F_NEW_OWNER) |
+BIT_ULL(VHOST_BACKEND_F_IOTLB_REMAP)))
return -EOPNOTSUPP;
if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) &&
 !vhost_vdpa_can_suspend(v))
@@ -1229,11 +1230,16 @@ static int vhost_vdpa_process_iotlb_remap(struct 
vhost_vdpa *v,
u64 start = msg->iova;
u64 last = start + msg->size - 1;
struct vhost_iotlb_map *map;
+   u64 features;
int r = 0;
 
if (msg->perm || !msg->size)
return -EINVAL;
 
+   features = ops->get_backend_features(vdpa);
+   if (!(features & BIT_ULL(VHOST_BACKEND_F_IOTLB_REMAP)))
+   return -EOPNOTSUPP;
+
map = vhost_iotlb_itree_first(iotlb, start, last);
if (!map)
return -ENOENT;
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 35908315ff55..7e79e9bd0f7b 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -203,5 +203,7 @@ struct vhost_vdpa_iova_range {
 #define VHOST_BACKEND_F_IOTLB_PERSIST  0x8
 /* Supports VHOST_NEW_OWNER */
 #define VHOST_BACKEND_F_NEW_OWNER  0x9
+/* Supports VHOST_IOTLB_REMAP */
+#define VHOST_BACKEND_F_IOTLB_REMAP  0xa
 
 #endif
-- 
2.39.3




[RFC V1 08/13] vduse: flush workers on suspend

2024-01-10 Thread Steve Sistare
To pass ownership of a live vdpa device to a new process, the user
suspends the device, calls VHOST_NEW_OWNER to change the mm, and calls
VHOST_IOTLB_REMAP to change the user virtual addresses to match the new
mm.  Flush workers in suspend to guarantee that no worker sees the new
mm and old VA in between.

Signed-off-by: Steve Sistare 
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0ddd4b8abecb..6b25457a037d 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -472,6 +472,18 @@ static void vduse_dev_reset(struct vduse_dev *dev)
	up_write(&dev->rwsem);
 }
 
+static void vduse_flush_work(struct vduse_dev *dev)
+{
+   flush_work(&dev->inject);
+
+   for (int i = 0; i < dev->vq_num; i++) {
+   struct vduse_virtqueue *vq = dev->vqs[i];
+
+   flush_work(&vq->inject);
+   flush_work(&vq->kick);
+   }
+}
+
 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
u64 desc_area, u64 driver_area,
u64 device_area)
@@ -713,6 +725,17 @@ static int vduse_vdpa_reset(struct vdpa_device *vdpa)
return ret;
 }
 
+static int vduse_vdpa_suspend(struct vdpa_device *vdpa)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   down_write(&dev->rwsem);
+   vduse_flush_work(dev);
+   up_write(&dev->rwsem);
+
+   return 0;
+}
+
 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
 {
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -794,6 +817,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.set_vq_affinity= vduse_vdpa_set_vq_affinity,
.get_vq_affinity= vduse_vdpa_get_vq_affinity,
.reset  = vduse_vdpa_reset,
+   .suspend= vduse_vdpa_suspend,
.set_map= vduse_vdpa_set_map,
.free   = vduse_vdpa_free,
 };
-- 
2.39.3




[RFC V1 03/13] vhost-vdpa: VHOST_NEW_OWNER

2024-01-10 Thread Steve Sistare
Add an ioctl to transfer file descriptor ownership and pinned memory
accounting from one process to another.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c   | 37 +
 drivers/vhost/vhost.c  | 15 +++
 drivers/vhost/vhost.h  |  1 +
 include/uapi/linux/vhost.h | 10 ++
 4 files changed, 63 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 2269988d6d33..eb3a95e703b0 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -613,6 +613,40 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v)
return ops->resume(vdpa);
 }
 
+static long vhost_vdpa_new_owner(struct vhost_vdpa *v)
+{
+   int r;
+   struct vhost_dev *vdev = &v->vdev;
+   struct mm_struct *mm_old = vdev->mm;
+   struct mm_struct *mm_new = current->mm;
+   long pinned_vm = v->pinned_vm;
+   unsigned long lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
+
+   if (!mm_old)
+   return -EINVAL;
+
+   if (!v->vdpa->use_va &&
+   pinned_vm + atomic64_read(&mm_new->pinned_vm) > lock_limit)
+   return -ENOMEM;
+
+   r = vhost_vdpa_bind_mm(v, mm_new);
+   if (r)
+   return r;
+
+   r = vhost_dev_new_owner(vdev);
+   if (r) {
+   vhost_vdpa_bind_mm(v, mm_old);
+   return r;
+   }
+
+   if (!v->vdpa->use_va) {
+   atomic64_sub(pinned_vm, &mm_old->pinned_vm);
+   atomic64_add(pinned_vm, &mm_new->pinned_vm);
+   }
+
+   return r;
+}
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
   void __user *argp)
 {
@@ -843,6 +877,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
case VHOST_VDPA_RESUME:
r = vhost_vdpa_resume(v);
break;
+   case VHOST_NEW_OWNER:
+   r = vhost_vdpa_new_owner(v);
+   break;
default:
	r = vhost_dev_ioctl(&v->vdev, cmd, argp);
if (r == -ENOIOCTLCMD)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e0c181ad17e3..0ce7ee9834f4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -907,6 +907,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 
+/* Caller should have device mutex */
+long vhost_dev_new_owner(struct vhost_dev *dev)
+{
+   if (dev->mm == current->mm)
+   return -EBUSY;
+
+   if (!vhost_dev_has_owner(dev))
+   return -EINVAL;
+
+   vhost_detach_mm(dev);
+   vhost_attach_mm(dev);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_dev_new_owner);
+
 static struct vhost_iotlb *iotlb_alloc(void)
 {
return vhost_iotlb_alloc(max_iotlb_entries,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f60d5f7bef94..cd0dab21d99e 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,7 @@ void vhost_dev_init(struct vhost_dev *, struct 
vhost_virtqueue **vqs,
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
   struct vhost_iotlb_msg *msg));
 long vhost_dev_set_owner(struct vhost_dev *dev);
+long vhost_dev_new_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
 struct vhost_iotlb *vhost_dev_reset_owner_prepare(void);
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 649560c685f1..5e3cdce4c0cf 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -123,6 +123,16 @@
 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
 #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64)
 
+/* Set current process as the new owner of this file descriptor.  The fd must
+ * already be owned, via a prior call to VHOST_SET_OWNER.  The pinned memory
+ * count is transferred from the previous to the new owner.
+ * Errors:
+ *   EINVAL: not owned
+ *   EBUSY:  caller is already the owner
+ *   ENOMEM: RLIMIT_MEMLOCK exceeded
+ */
+#define VHOST_NEW_OWNER _IO(VHOST_VIRTIO, 0x27)
+
 /* VHOST_NET specific defines */
 
 /* Attach virtio net ring to a raw socket, or tap device.
-- 
2.39.3




[RFC V1 00/13] vdpa live update

2024-01-10 Thread Steve Sistare
Live update is a technique wherein an application saves its state, exec's
to an updated version of itself, and restores its state.  Clients of the
application experience a brief suspension of service, on the order of 
100's of milliseconds, but are otherwise unaffected.

Define and implement interfaces that allow vdpa devices to be preserved
across fork or exec, to support live update for applications such as qemu.
The device must be suspended during the update, but its dma mappings are
preserved, so the suspension is brief.

The VHOST_NEW_OWNER ioctl transfers device ownership and pinned memory
accounting from one process to another.

The VHOST_BACKEND_F_NEW_OWNER backend capability indicates that
VHOST_NEW_OWNER is supported.

The VHOST_IOTLB_REMAP message type updates a dma mapping with its userland
address in the new process.

The VHOST_BACKEND_F_IOTLB_REMAP backend capability indicates that
VHOST_IOTLB_REMAP is supported and required.  Some devices do not
require it, because the userland address of each dma mapping is discarded
after being translated to a physical address.

Here is a pseudo-code sequence for performing live update, based on
suspend + reset because resume is not yet available.  The vdpa device
descriptor, fd, remains open across the exec.

  ioctl(fd, VHOST_VDPA_SUSPEND)
  ioctl(fd, VHOST_VDPA_SET_STATUS, 0)
  exec 

  ioctl(fd, VHOST_NEW_OWNER)

  issue ioctls to re-create vrings

  if VHOST_BACKEND_F_IOTLB_REMAP
  foreach dma mapping
  write(fd, {VHOST_IOTLB_REMAP, new_addr})

  ioctl(fd, VHOST_VDPA_SET_STATUS,
ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK)
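
A rough C rendering of the same sequence, split across the exec.  This is a
sketch under a few assumptions: the fd is left open across exec, the vring
re-creation ioctls and error handling are elided, struct dma_map is
application-side bookkeeping invented here, and remap_one() stands for the
VHOST_IOTLB_REMAP write sketched after [RFC V1 05/13].

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_config.h>

struct dma_map { uint64_t iova, size, new_uaddr; };	/* app-side bookkeeping */

int remap_one(int fd, const struct dma_map *m);		/* see sketch after 05/13 */

static void live_update_old_process(int fd)
{
	uint8_t status = 0;

	ioctl(fd, VHOST_VDPA_SUSPEND);
	ioctl(fd, VHOST_VDPA_SET_STATUS, &status);
	/* exec() the new binary; fd must not be marked FD_CLOEXEC */
}

static void live_update_new_process(int fd, int have_remap,
				    const struct dma_map *maps, int nmaps)
{
	uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER |
			 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_DRIVER_OK;

	ioctl(fd, VHOST_NEW_OWNER);		/* added by this series */

	/* ... re-create vrings here ... */

	if (have_remap)				/* VHOST_BACKEND_F_IOTLB_REMAP */
		for (int i = 0; i < nmaps; i++)
			remap_one(fd, &maps[i]);

	ioctl(fd, VHOST_VDPA_SET_STATUS, &status);
}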


Steve Sistare (13):
  vhost-vdpa: count pinned memory
  vhost-vdpa: pass mm to bind
  vhost-vdpa: VHOST_NEW_OWNER
  vhost-vdpa: VHOST_BACKEND_F_NEW_OWNER
  vhost-vdpa: VHOST_IOTLB_REMAP
  vhost-vdpa: VHOST_BACKEND_F_IOTLB_REMAP
  vhost-vdpa: flush workers on suspend
  vduse: flush workers on suspend
  vdpa_sim: reset must not run
  vdpa_sim: flush workers on suspend
  vdpa/mlx5: new owner capability
  vdpa_sim: new owner capability
  vduse: new owner capability

 drivers/vdpa/mlx5/net/mlx5_vnet.c  |   3 +-
 drivers/vdpa/vdpa_sim/vdpa_sim.c   |  24 ++-
 drivers/vdpa/vdpa_user/vduse_dev.c |  32 +
 drivers/vhost/vdpa.c   | 101 +++--
 drivers/vhost/vhost.c  |  15 +
 drivers/vhost/vhost.h  |   1 +
 include/uapi/linux/vhost.h |  10 +++
 include/uapi/linux/vhost_types.h   |  15 -
 8 files changed, 191 insertions(+), 10 deletions(-)

-- 
2.39.3




[RFC V1 02/13] vhost-vdpa: pass mm to bind

2024-01-10 Thread Steve Sistare
Pass the target mm to vhost_vdpa_bind_mm.  No functional change.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 10fb95bcca1a..2269988d6d33 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -248,7 +248,7 @@ static int vhost_vdpa_reset(struct vhost_vdpa *v)
return _compat_vdpa_reset(v);
 }
 
-static long vhost_vdpa_bind_mm(struct vhost_vdpa *v)
+static long vhost_vdpa_bind_mm(struct vhost_vdpa *v, struct mm_struct *mm)
 {
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
@@ -256,7 +256,7 @@ static long vhost_vdpa_bind_mm(struct vhost_vdpa *v)
if (!vdpa->use_va || !ops->bind_mm)
return 0;
 
-   return ops->bind_mm(vdpa, v->vdev.mm);
+   return ops->bind_mm(vdpa, mm);
 }
 
 static void vhost_vdpa_unbind_mm(struct vhost_vdpa *v)
@@ -855,7 +855,7 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 
switch (cmd) {
case VHOST_SET_OWNER:
-   r = vhost_vdpa_bind_mm(v);
+   r = vhost_vdpa_bind_mm(v, v->vdev.mm);
if (r)
vhost_dev_reset_owner(d, NULL);
break;
-- 
2.39.3




[RFC V1 01/13] vhost-vdpa: count pinned memory

2024-01-10 Thread Steve Sistare
Remember the count of pinned memory for the device.

Signed-off-by: Steve Sistare 
---
 drivers/vhost/vdpa.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index da7ec77cdaff..10fb95bcca1a 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -59,6 +59,7 @@ struct vhost_vdpa {
int in_batch;
struct vdpa_iova_range range;
u32 batch_asid;
+   long pinned_vm;
 };
 
 static DEFINE_IDA(vhost_vdpa_ida);
@@ -893,6 +894,7 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, 
struct vhost_iotlb *iotlb,
unpin_user_page(page);
}
	atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
+   v->pinned_vm -= PFN_DOWN(map->size);
vhost_vdpa_general_unmap(v, map, asid);
vhost_iotlb_map_free(iotlb, map);
}
@@ -975,9 +977,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct 
vhost_iotlb *iotlb,
return r;
}
 
-   if (!vdpa->use_va)
+   if (!vdpa->use_va) {
	atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
-
+   v->pinned_vm += PFN_DOWN(size);
+   }
return 0;
 }
 
-- 
2.39.3




[PATCH v4 05/10] sched/fair: Hoist idle_stamp up from idle_balance

2018-12-06 Thread Steve Sistare
Move the update of idle_stamp from idle_balance to the call site in
pick_next_task_fair, to prepare for a future patch that adds work to
pick_next_task_fair which must be included in the idle_stamp interval.
No functional change.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e105db..8a33ad9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3725,6 +3725,16 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
rq->misfit_task_load = task_h_load(p);
 }
 
+static inline void rq_idle_stamp_update(struct rq *rq)
+{
+   rq->idle_stamp = rq_clock(rq);
+}
+
+static inline void rq_idle_stamp_clear(struct rq *rq)
+{
+   rq->idle_stamp = 0;
+}
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
@@ -3770,6 +3780,8 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+static inline void rq_idle_stamp_update(struct rq *rq) {}
+static inline void rq_idle_stamp_clear(struct rq *rq) {}
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
 
@@ -6764,8 +6776,18 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
 
 idle:
update_misfit_status(NULL, rq);
+
+   /*
+* We must set idle_stamp _before_ calling idle_balance(), such that we
+* measure the duration of idle_balance() as idle time.
+*/
+   rq_idle_stamp_update(rq);
+
new_tasks = idle_balance(rq, rf);
 
+   if (new_tasks)
+   rq_idle_stamp_clear(rq);
+
/*
 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 * possible for any higher priority task to appear. In that case we
@@ -9611,12 +9633,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
u64 curr_cost = 0;
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
-*/
-   this_rq->idle_stamp = rq_clock(this_rq);
-
-   /*
 * Do not pull tasks towards !active CPUs...
 */
if (!cpu_active(this_cpu))
@@ -9707,9 +9723,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
-   if (pulled_task)
-   this_rq->idle_stamp = 0;
-
rq_repin_lock(this_rq, rf);
 
return pulled_task;
-- 
1.8.3.1



[PATCH v4 04/10] sched/fair: Dynamically update cfs_overload_cpus

2018-12-06 Thread Steve Sistare
An overloaded CPU has more than 1 runnable task.  When a CFS task wakes
on a CPU, if h_nr_running transitions from 1 to more, then set the CPU in
the cfs_overload_cpus bitmap.  When a CFS task sleeps, if h_nr_running
transitions from 2 to less, then clear the CPU in cfs_overload_cpus.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 52 
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb..4e105db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -21,6 +21,7 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 #include "sched.h"
+#include "sparsemask.h"
 
 #include 
 
@@ -3724,6 +3725,28 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
rq->misfit_task_load = task_h_load(p);
 }
 
+static void overload_clear(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_clear_elem(overload_cpus, rq->cpu);
+   rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_set_elem(overload_cpus, rq->cpu);
+   rcu_read_unlock();
+}
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3747,6 +3770,9 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
 static inline void
 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 
@@ -4441,6 +4467,7 @@ static int tg_throttle_down(struct task_group *tg, void 
*data)
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
@@ -4468,8 +4495,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue = 0;
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, task_delta);
+   if (prev_nr >= 2 && prev_nr - task_delta < 2)
+   overload_clear(rq);
+
+   }
 
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4499,6 +4530,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
@@ -4535,8 +4567,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, task_delta);
+   if (prev_nr < 2 && prev_nr + task_delta >= 2)
+   overload_set(rq);
+   }
 
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -5082,6 +5117,7 @@ static inline void hrtick_update(struct rq *rq)
 {
struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
/*
 * The code below (indirectly) updates schedutil which looks at
@@ -5129,8 +5165,12 @@ static inline void hrtick_update(struct rq *rq)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+   if (prev_nr == 1)
+   overload_set(rq);
+
+   }
 
hrtick_update(rq);
 }
@@ -5147,6 +5187,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5188,8 +5229,11 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, 1);
+   if (prev_nr == 2)
+   overload_clear(rq);
+   }
 
	util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
-- 
1.8.3.1



[PATCH v4 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-12-06 Thread Steve Sistare
Provide struct sparsemask and functions to manipulate it.  A sparsemask is
a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
threads concurrently set, clear, and visit elements, by reducing the number
of significant bits per cacheline.  For each cacheline chunk of the mask,
only the first K bits of the first word are used, and the remaining bits
are ignored, where K is a creation time parameter.  Thus a sparsemask that
can represent a set of N elements is approximately (N/K * CACHELINE) bytes
in size.
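For example, with 64-byte cache lines and K = 8, a sparsemask covering
N = 512 CPUs occupies about 512/8 * 64 = 4096 bytes, versus 64 bytes for a
dense 512-bit bitmap, trading space for fewer writers per cache line.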

This type is simpler and more efficient than the struct sbitmap used by
block drivers.

Signed-off-by: Steve Sistare 
---
 kernel/sched/sparsemask.h | 210 ++
 1 file changed, 210 insertions(+)
 create mode 100644 kernel/sched/sparsemask.h

diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h
new file mode 100644
index 000..1194862
--- /dev/null
+++ b/kernel/sched/sparsemask.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include 
+#include 
+#include 
+
+/*
+ * A sparsemask is a sparse bitmap.  It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements.  For
+ * each cacheline chunk of the mask, only the first K bits of the first word 
are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter.  Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * CACHELINE) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ */
+
+struct sparsemask_chunk {
+   unsigned long word; /* the significant bits */
+} cacheline_aligned_in_smp;
+
+struct sparsemask {
+   short nelems;   /* current number of elements */
+   short density;  /* store 2^density elements per chunk */
+   struct sparsemask_chunk chunks[0];  /* embedded array of chunks */
+};
+
+#define _SMASK_INDEX(density, elem)	((elem) >> (density))
+#define _SMASK_BIT(density, elem)  ((elem) & ((1U << (density)) - 1U))
+#define SMASK_INDEX(mask, elem)	_SMASK_INDEX((mask)->density, elem)
+#define SMASK_BIT(mask, elem)  _SMASK_BIT((mask)->density, elem)
+#define SMASK_WORD(mask, elem) \
+   (&(mask)->chunks[SMASK_INDEX((mask), (elem))].word)
+
+/*
+ * sparsemask_next() - Return the next one bit in a bitmap, starting at a
+ * specified position and wrapping from the last bit to the first, up to but
+ * not including a specified origin.  This is a helper, so do not call it
+ * directly.
+ *
+ * @mask: Bitmap to search.
+ * @origin: Origin.
+ * @prev: Previous bit. Start search after this bit number.
+ *   If -1, start search at @origin.
+ *
+ * Return: the bit number, else mask->nelems if no bits are set in the range.
+ */
+static inline int
+sparsemask_next(const struct sparsemask *mask, int origin, int prev)
+{
+   int density = mask->density;
+   int bits_per_word = 1U << density;
+   const struct sparsemask_chunk *chunk;
+   int nelems = mask->nelems;
+   int next, bit, nbits;
+   unsigned long word;
+
+   /* Calculate number of bits to be searched. */
+   if (prev == -1) {
+   nbits = nelems;
+   next = origin;
+   } else if (prev < origin) {
+   nbits = origin - prev;
+   next = prev + 1;
+   } else {
+   nbits = nelems - prev + origin - 1;
+   next = prev + 1;
+   }
+
+   if (unlikely(next >= nelems))
+   return nelems;
+
+   /*
+* Fetch and adjust first word.  Clear word bits below @next, and round
+* @next down to @bits_per_word boundary because later ffs will add
+* those bits back.
+*/
+   chunk = &mask->chunks[_SMASK_INDEX(density, next)];
+   bit = _SMASK_BIT(density, next);
+   word = chunk->word & (~0UL << bit);
+   next -= bit;
+   nbits += bit;
+
+   while (!word) {
+   next += bits_per_word;
+   nbits -= bits_per_word;
+   if (nbits <= 0)
+   return nelems;

[PATCH v4 03/10] sched/topology: Provide cfs_overload_cpus bitmap

2018-12-06 Thread Steve Sistare
From: Steve Sistare 

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.
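
For context, the pointer is only ever dereferenced under RCU; the access
pattern used by the CFS changes later in this series (see overload_set()
in patch 8) is roughly:

	rcu_read_lock();
	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
	if (overload_cpus)
		sparsemask_set_elem(overload_cpus, rq->cpu);
	rcu_read_unlock();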

Signed-off-by: Steve Sistare 
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h   |  2 ++
 kernel/sched/topology.c| 25 +++--
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6b99761..b173a77 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@ struct sched_domain_shared {
atomic_tref;
atomic_tnr_busy_cpus;
int has_idle_cores;
+   struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577f..eacf5db 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED  1
@@ -812,6 +813,7 @@ struct rq {
struct cfs_rq   cfs;
struct rt_rqrt;
struct dl_rqdl;
+   struct sparsemask   *cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3e72ce0..89a78ce 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include "sparsemask.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -410,7 +411,9 @@ static void destroy_sched_domains(struct sched_domain *sd)
 
 static void update_top_cache_domain(int cpu)
 {
+   struct sparsemask *cfs_overload_cpus = NULL;
struct sched_domain_shared *sds = NULL;
+   struct rq *rq = cpu_rq(cpu);
struct sched_domain *sd;
int id = cpu;
int size = 1;
@@ -420,8 +423,10 @@ static void update_top_cache_domain(int cpu)
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;
+   cfs_overload_cpus = sds->cfs_overload_cpus;
}
 
+   rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
@@ -1621,7 +1626,22 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-   /* Allocate sd->shared data here. Empty for now. */
+   struct sched_domain_shared *sds = sd->shared;
+   struct cpumask *span = sched_domain_span(sd);
+   int nid = cpu_to_node(cpumask_first(span));
+   int flags = __GFP_ZERO | GFP_KERNEL;
+   struct sparsemask *mask;
+
+   /*
+* Allocate the bitmap if not already allocated.  This is called for
+* every CPU in the LLC but only allocates once per sd_llc_shared.
+*/
+   if (!sds->cfs_overload_cpus) {
+   mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid);
+   if (!mask)
+   return 1;
+   sds->cfs_overload_cpus = mask;
+   }
 
return 0;
 }
@@ -1633,7 +1653,8 @@ static void sd_llc_free(struct sched_domain *sd)
if (!sds)
return;
 
-   /* Free data here. Empty for now. */
+   sparsemask_free(sds->cfs_overload_cpus);
+   sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
-- 
1.8.3.1



[PATCH v4 06/10] sched/fair: Generalize the detach_task interface

2018-12-06 Thread Steve Sistare
The detach_task function takes a struct lb_env argument, but only needs a
few of its members.  Pass the rq and cpu arguments explicitly so the
function may be called from code that is not based on lb_env.  No
functional change.
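
With this change, a caller that is not driven by lb_env can simply do
detach_task(p, src_rq, dst_cpu) while holding src_rq->lock, which is how
the steal path added later in this series uses it.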

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a33ad9..9b7c85b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7207,15 +7207,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
- * detach_task() -- detach the task for the migration specified in env
+ * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
-static void detach_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
 {
-   lockdep_assert_held(&env->src_rq->lock);
+   lockdep_assert_held(&src_rq->lock);
 
p->on_rq = TASK_ON_RQ_MIGRATING;
-   deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
-   set_task_cpu(p, env->dst_cpu);
+   deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+   set_task_cpu(p, dst_cpu);
 }
 
 /*
@@ -7235,7 +7235,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
if (!can_migrate_task(p, env))
continue;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
 
/*
 * Right now, this is only the second place where
@@ -7302,7 +7302,7 @@ static int detach_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
	list_add(&p->se.group_node, &env->tasks);
 
detached++;
-- 
1.8.3.1



[PATCH v4 00/10] steal tasks to improve CPU utilization

2018-12-06 Thread Steve Sistare
   9.3
      16  43.779     0.1    41.741     0.2         4.8

KVM 4-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
tbench, average of 11 runs.

  clients    %speedup
        1        16.2
        2        11.7
        4         9.9
        8        12.8
       16        13.7

KVM 2-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  Benchmark %speedup
  specjbb2015_critical_jops  5.7
  mysql_sysb1.0.14_mutex_2  40.6
  mysql_sysb1.0.14_oltp_2    3.9

-- 2 Socket Results --

X6-2: 2 sockets * 10 cores * 2 hyperthreads = 40 CPUs
Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

              --- base ---        --- new ---
  groups    time  %stdev      time  %stdev    %speedup
       1   7.945     0.2     7.219     8.7        10.0
       2   8.444     0.4     6.689     1.5        26.2
       3  12.100     1.1     9.962     2.0        21.4
       4  15.001     0.4    13.109     1.1        14.4
       8  27.960     0.2    26.127     0.3         7.0

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

              --- base ---        --- new ---
  groups    time  %stdev      time  %stdev    %speedup
       1   5.826     5.4     5.840     5.0        -0.3
       2   5.041     5.3     6.171    23.4       -18.4
       3   6.839     2.1     6.324     3.8         8.1
       4   8.177     0.6     7.318     3.6        11.7
       8  14.429     0.7    13.966     1.3         3.3
      16  26.401     0.3    25.149     1.5         4.9


X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
        120      40       -1.2
        240      80        2.7
        360     120        8.9
        480     160        4.4
        600     200        3.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                              19.6
  mysql_sysb1.0.14_fileio_56_seqrd                              12.1
  mysql_sysb1.0.14_fileio_56_rndwr                               0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                            -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                              19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                               8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                               1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                             0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                                7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M                   5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                        4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M                 5.8

  lm3_memlat_L2                                                  4.8
  lm3_memlat_L1                                                  0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching           60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent               5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent              -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks    2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench  class  ncpu    %improved(Mops)
  dc     B        72    1.3
  is     C        72    0.9
  is     D        72    0.7

  sysbench mysql, average of 24 runs
              --- base ---          --- new ---
  nthr     events  %stdev      events  %stdev    %speedup
     1      331.0    0.25       331.0    0.24        -0.1
     2      661.3    0.22       661.8    0.22         0.0
     4     1297.0    0.88      1300.5    0.82         0.2
     8     2420.8    0.04      2420.5    0.04        -0.1
    16     4826.3    0.07      4825.4    0.05        -0.1
    32     8815.3    0.27      8830.2    0.18         0.1
    64    12823.0    0.24     12823.6    0.26         0.0

--

Changes from v1 to v2:
  - Remove stray find_time hunk from patch 5
  - Fix "warning: label out defined but not used" for !CONFIG_SCHED_SMT
  - Set SCHED_STEAL_NODE_LIMIT_DEFAULT to 2
  - Steal iff avg_idle exceeds the cost of stealing

Changes from v2 to v3:
  - Update series for kernel 4.20.  Context changes only.

Changes from v3 to v4:
  - Avoid 64-bit division on 32-bit processors in compute_skid()
  - Replace IF_SMP with inline functions to set idle_stamp
  - Push ZALLOC_MASK body into calling function
  - Set rq->cfs_overload_cpus in update_top_cache_domain instead of
cpu_attach_domain
  - Rewrite sparsemask iterator for complete inlining
  - Cull and clean up sparsemask functions and moved all into
sched/sparsemask.h

Steve Sistare (10):
  sched: Provide sparsemask, a reduced contention bitmap
  sched/topology: Provide hooks to allocate data shared per LLC
  sched/topology: Provide cfs_overload_cpus bitmap
  sched/fair: Dynamically upda

[PATCH v4 07/10] sched/fair: Provide can_migrate_task_llc

2018-12-06 Thread Steve Sistare
Define a simpler version of can_migrate_task called can_migrate_task_llc
which does not require a struct lb_env argument, and judges whether a
migration from one CPU to another within the same LLC should be allowed.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b7c85b..3804156 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7207,6 +7207,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+   int dst_cpu = dst_rq->cpu;
+
+   lockdep_assert_held(&rq->lock);
+
+   if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+   return false;
+
+   if (!cpumask_test_cpu(dst_cpu, &p->cpus_allowed)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+   return false;
+   }
+
+   if (task_running(rq, p)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+   return false;
+   }
+
+   return true;
+}
+
+/*
  * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
 static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
-- 
1.8.3.1



[PATCH v4 02/10] sched/topology: Provide hooks to allocate data shared per LLC

2018-12-06 Thread Steve Sistare
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and
free data pointed to by struct sched_domain_shared at the last-level-cache
domain.  sd_llc_alloc_all() is called after the SD hierarchy is known, to
eliminate the unnecessary allocations that would occur if we instead
allocated in __sdt_alloc() and then figured out which shared nodes are
redundant.

Signed-off-by: Steve Sistare 
---
 kernel/sched/topology.c | 75 -
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8d7f15b..3e72ce0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,12 @@
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
@@ -361,8 +367,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 */
free_sched_groups(sd->groups, 1);
 
-   if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+   if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+   sd_llc_free(sd);
kfree(sd->shared);
+   }
kfree(sd);
 }
 
@@ -996,6 +1004,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
free_percpu(d->sd);
/* Fall through */
case sa_sd_storage:
+   sd_llc_free_all(cpu_map);
__sdt_free(cpu_map);
/* Fall through */
case sa_none:
@@ -1610,6 +1619,62 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+   /* Allocate sd->shared data here. Empty for now. */
+
+   return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+   struct sched_domain_shared *sds = sd->shared;
+
+   if (!sds)
+   return;
+
+   /* Free data here. Empty for now. */
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+   struct sched_domain *sd, *hsd;
+   int i;
+
+   for_each_cpu(i, cpu_map) {
+   /* Find highest domain that shares resources */
+   hsd = NULL;
+   for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+   if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+   break;
+   hsd = sd;
+   }
+   if (hsd && sd_llc_alloc(hsd))
+   return 1;
+   }
+
+   return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+   struct sched_domain_topology_level *tl;
+   struct sched_domain *sd;
+   struct sd_data *sdd;
+   int j;
+
+   for_each_sd_topology(tl) {
+   sdd = &tl->data;
+   if (!sdd)
+   continue;
+   for_each_cpu(j, cpu_map) {
+   sd = *per_cpu_ptr(sdd->sd, j);
+   if (sd)
+   sd_llc_free(sd);
+   }
+   }
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int dflags, int cpu)
@@ -1769,6 +1834,14 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
}
 
+   /*
+* Allocate shared sd data at last level cache.  Must be done after
+* domains are built above, but before the data is used in
+* cpu_attach_domain and descendants below.
+*/
+   if (sd_llc_alloc_all(cpu_map, &d))
+   goto error;
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
-- 
1.8.3.1



[PATCH v4 10/10] sched/fair: Provide idle search schedstats

2018-12-06 Thread Steve Sistare
Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
  prev or target is idle.
field 11: # of times select_idle_sibling searched and found an idle cpu.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
  CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
   migratable.
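
For reference, a minimal user-space reader for the new per-CPU fields
might look like the sketch below.  It assumes schedstats are enabled and
the version 16 layout described above; the labels it prints are invented
for the example:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[1024];
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long long v[16];
		char cpu[16];

		if (strncmp(line, "cpu", 3))
			continue;	/* skip version, timestamp, domain lines */
		if (sscanf(line, "%15s %llu %llu %llu %llu %llu %llu %llu %llu "
			   "%llu %llu %llu %llu %llu %llu %llu %llu",
			   cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
			   &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15]) < 17)
			continue;
		/* fields 10-16 are v[9]..v[15] */
		printf("%s easy=%llu found_cpu=%llu found_core=%llu none=%llu "
		       "find_ns=%llu steals=%llu steal_fail=%llu\n",
		       cpu, v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
	}
	fclose(f);
	return 0;
}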

Signed-off-by: Steve Sistare 
---
 kernel/sched/core.c  | 31 --
 kernel/sched/fair.c  | 54 ++--
 kernel/sched/sched.h |  9 +
 kernel/sched/stats.c | 11 ++-
 kernel/sched/stats.h | 13 +
 5 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f..14ee88b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2220,17 +2220,44 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
 
+unsigned long schedstat_skid;
+
+static void compute_skid(void)
+{
+   int i, n = 0;
+   s64 t;
+   int skid = 0;
+
+   for (i = 0; i < 100; i++) {
+   t = local_clock();
+   t = local_clock() - t;
+   if (t > 0 && t < 1000) {/* only use sane samples */
+   skid += (int) t;
+   n++;
+   }
+   }
+
+   if (n > 0)
+   schedstat_skid = skid / n;
+   else
+   schedstat_skid = 0;
+   pr_info("schedstat_skid = %lu\n", schedstat_skid);
+}
+
 static void set_schedstats(bool enabled)
 {
-   if (enabled)
+   if (enabled) {
+   compute_skid();
	static_branch_enable(&sched_schedstats);
-   else
+   } else {
	static_branch_disable(&sched_schedstats);
+   }
 }
 
 void force_schedstat_enabled(void)
 {
if (!schedstat_enabled()) {
+   compute_skid();
pr_info("kernel profiling enabled schedstats, disable via 
kernel.sched_schedstats.\n");
static_branch_enable(_schedstats);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1efd9c4..73e9873 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3748,29 +3748,35 @@ static inline bool steal_enabled(void)
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_clear_elem(overload_cpus, rq->cpu);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_set_elem(overload_cpus, rq->cpu);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -6191,6 +6197,16 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return cpu;
 }
 
+#define SET_STAT(STAT) \
+   do {\
+   if (schedstat_enabled()) {  \
+   struct rq *rq = this_rq();  \
+   \
+   if (rq) \
+   __schedstat_inc(rq->STAT);  \
+   }   \
+   } while (0)
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -6199,14 +6215,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
 
-   if (available_idle_cpu(target))
+   if (ava

[PATCH v4 09/10] sched/fair: disable stealing if too many NUMA nodes

2018-12-06 Thread Steve Sistare
ize the number of
cross-node moves in all conditions, with limited success.  The fundamental
problem is that the scheduler does not track which groups of tasks talk to
each other.  Parts of several groups become entrenched on the same node,
filling it to capacity, leaving no room for either group to pull its peers
over, and there is neither data nor mechanism for the scheduler to evict
one group to make room for the other.

For now, disable STEAL on such systems until we can do better, or it is
shown that hackbench is atypical and most workloads benefit from stealing.
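
For example, an administrator who still wants stealing on a larger machine
can raise the limit at boot time, e.g. sched_steal_node_limit=8 on the
kernel command line; the early_param handler added below reads that value.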

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 16 +---
 kernel/sched/sched.h|  2 +-
 kernel/sched/topology.c | 25 +
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1476ae8..1efd9c4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3735,11 +3735,21 @@ static inline void rq_idle_stamp_clear(struct rq *rq)
rq->idle_stamp = 0;
 }
 
+static inline bool steal_enabled(void)
+{
+#ifdef CONFIG_NUMA
+   bool allow = static_branch_likely(&sched_steal_allow);
+#else
+   bool allow = true;
+#endif
+   return sched_feat(STEAL) && allow;
+}
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -3753,7 +3763,7 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -9902,7 +9912,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
int stolen = 0;
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return 0;
 
if (!cpu_active(dst_cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eacf5db..2a28340 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -936,7 +936,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
 #ifdef CONFIG_SCHED_SMT
 
 extern struct static_key_false sched_smt_present;
@@ -1185,6 +1184,7 @@ enum numa_topology_type {
 #endif
 
 #ifdef CONFIG_NUMA
+extern struct static_key_true sched_steal_allow;
 extern void sched_init_numa(void);
 extern void sched_domains_numa_masks_set(unsigned int cpu);
 extern void sched_domains_numa_masks_clear(unsigned int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 89a78ce..259d659 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,6 +1344,30 @@ static void init_numa_topology_type(void)
}
 }
 
+DEFINE_STATIC_KEY_TRUE(sched_steal_allow);
+static int sched_steal_node_limit;
+#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2
+
+static int __init steal_node_limit_setup(char *buf)
+{
+   get_option(&buf, &sched_steal_node_limit);
+   return 0;
+}
+
+early_param("sched_steal_node_limit", steal_node_limit_setup);
+
+static void check_node_limit(void)
+{
+   int n = num_possible_nodes();
+
+   if (sched_steal_node_limit == 0)
+   sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT;
+   if (n > sched_steal_node_limit) {
+   static_branch_disable(&sched_steal_allow);
+   pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n);
+   }
+}
+
 void sched_init_numa(void)
 {
int next_distance, curr_distance = node_distance(0, 0);
@@ -1492,6 +1516,7 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[level - 1];
 
init_numa_topology_type();
+   check_node_limit();
 }
 
 void sched_domains_numa_masks_set(unsigned int cpu)
-- 
1.8.3.1



[PATCH v4 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle

2018-12-06 Thread Steve Sistare
 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

              --- base ---        --- new ---
  groups    time  %stdev      time  %stdev    %speedup
       1   5.826     5.4     5.840     5.0        -0.3
       2   5.041     5.3     6.171    23.4       -18.4
       3   6.839     2.1     6.324     3.8         8.1
       4   8.177     0.6     7.318     3.6        11.7
       8  14.429     0.7    13.966     1.3         3.3
      16  26.401     0.3    25.149     1.5         4.9

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
        120      40       -1.2
        240      80        2.7
        360     120        8.9
        480     160        4.4
        600     200        3.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                              19.6
  mysql_sysb1.0.14_fileio_56_seqrd                              12.1
  mysql_sysb1.0.14_fileio_56_rndwr                               0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                            -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                              19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                               8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                               1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                             0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                                7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M                   5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                        4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M                 5.8

  lm3_memlat_L2                                                  4.8
  lm3_memlat_L1                                                  0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching           60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent               5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent              -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks    2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench  class  ncpu    %improved(Mops)
  dc     B        72    1.3
  is     C        72    0.9
  is     D        72    0.7

  sysbench mysql, average of 24 runs
              --- base ---          --- new ---
  nthr     events  %stdev      events  %stdev    %speedup
     1      331.0    0.25       331.0    0.24        -0.1
     2      661.3    0.22       661.8    0.22         0.0
     4     1297.0    0.88      1300.5    0.82         0.2
     8     2420.8    0.04      2420.5    0.04        -0.1
    16     4826.3    0.07      4825.4    0.05        -0.1
    32     8815.3    0.27      8830.2    0.18         0.1
    64    12823.0    0.24     12823.6    0.26         0.0

-----

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 169 ++--
 kernel/sched/features.h |   6 ++
 2 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3804156..1476ae8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3739,6 +3739,9 @@ static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3750,6 +3753,9 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3757,6 +3763,8 @@ static void overload_set(struct rq *rq)
rcu_read_unlock();
 }
 
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3793,6 +3801,11 @@ static inline void overload_set(struct rq *rq) {}
 bool task_sleep) {}
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -6778,20 +6791,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
update_misfit_status(NULL, rq);
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
+* We must set idle_stamp _before_ calling try_steal() or
+* idle_balance(), such that we measure the duration as idle time.
 */
rq_idle_stamp_update(rq);
 
new_tasks = idle_balance(rq, rf);
+   if (new_tasks == 0)
+ 

[PATCH v3 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-11-09 Thread Steve Sistare
From: Steve Sistare 

Provide struct sparsemask and functions to manipulate it.  A sparsemask is
a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
threads concurrently set, clear, and visit elements, by reducing the number
of significant bits per cacheline.  For each 64 byte chunk of the mask,
only the first K bits of the first word are used, and the remaining bits
are ignored, where K is a creation time parameter.  Thus a sparsemask that
can represent a set of N elements is approximately (N/K * 64) bytes in
size.
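
A quick worked example of that size formula (a sketch, not part of the patch;
N=88 and K=8 are assumed values, chosen to match the 88-CPU machines used in
the benchmark results elsewhere in this series):

  /* Approximate sparsemask footprint: N/K chunks of 64 bytes each. */
  #include <stdio.h>

  int main(void)
  {
          int N = 88;     /* elements to represent, e.g. CPUs */
          int K = 8;      /* elements stored per 64-byte chunk (assumed density) */

          printf("approx %d bytes\n", (N + K - 1) / K * 64);   /* prints 704 */
          return 0;
  }

The extra space relative to a dense bitmap is the price paid for keeping
concurrently updated bits on separate cachelines.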

Signed-off-by: Steve Sistare 
---
 include/linux/sparsemask.h | 260 +
 lib/Makefile   |   2 +-
 lib/sparsemask.c   | 142 +
 3 files changed, 403 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sparsemask.h
 create mode 100644 lib/sparsemask.c

diff --git a/include/linux/sparsemask.h b/include/linux/sparsemask.h
new file mode 100644
index 000..d36a3be
--- /dev/null
+++ b/include/linux/sparsemask.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include 
+#include 
+#include 
+
+/*
+ * A sparsemask is a sparse bitmap.  It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements.  For
+ * each 64 byte chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter.  Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * 64) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ *
+ * This file is partially derived from cpumask.h, and the public sparsemask
+ * operations are drop-in replacements for cpumask operations. However,
+ * sparsemask has no dependency on CPU definitions and can be used to
+ * represent any kind of elements.
+ */
+
+struct sparsemask {
+   short nelems;   /* current number of elements */
+   short density;  /* store 2^density elements per chunk */
+   unsigned long bits[0];  /* embedded array of chunks */
+};
+
+/* The maximum value for density, which implicitly defines the chunk size */
+
+#define _SMASK_DENSITY_MAX 6
+
+#define SMASK_DENSITY_TO_BYTES(density)    (1U << (density))
+#define SMASK_DENSITY_TO_ELEMS(density)    (1U << (density))
+
+/* The number of elements/bits/bytes/longs in a chunk */
+
+#define SMASK_ELEMS(mask)  SMASK_DENSITY_TO_ELEMS((mask)->density)
+#define SMASK_BYTES    SMASK_DENSITY_TO_BYTES(_SMASK_DENSITY_MAX)
+#define SMASK_BITS     (SMASK_BYTES * BITS_PER_BYTE)
+#define SMASK_LONGS    (SMASK_BYTES / sizeof(long))
+
+/*
+ * Translate element index @elem to a bit/byte/long index.
+ * @density: the density of a chunk.
+ */
+
+#define _SMASK_ELEM_TO_BIT(elem, density)  \
+   ((elem) / SMASK_DENSITY_TO_ELEMS(density) * SMASK_BITS +\
+(elem) % SMASK_DENSITY_TO_ELEMS(density))
+
+#define _SMASK_ELEM_TO_BYTE(elem, density) \
+   (_SMASK_ELEM_TO_BIT(elem, density) / BITS_PER_BYTE)
+
+#define _SMASK_ELEM_TO_LONG(elem, density) \
+   (_SMASK_ELEM_TO_BYTE(elem, density) / sizeof(long))
+
+/* Translate @bit/@byte/@long index to an element index */
+
+#define _SMASK_BIT_TO_ELEM(bit, density)   \
+   ((bit) / SMASK_BITS * SMASK_DENSITY_TO_ELEMS(density) + \
+(bit) % SMASK_BITS)
+
+#define _SMASK_BYTE_TO_ELEM(byte, density) \
+   _SMASK_BIT_TO_ELEM((byte) * BITS_PER_BYTE, density)
+
+#define _SMASK_LONG_TO_ELEM(index, density)\
+   _SMASK_BYTE_TO_ELEM((index) * sizeof(long), density)
+
+/* Same translations as above, but taking sparsemask @m instead of density */
+
+#define SMASK_ELEM_TO_BYTE(elem, m)    _SMASK_ELEM_TO_BYTE(elem, (m)->density)
+#define SMASK_ELEM_TO_BIT(elem, m)     _SMASK_ELEM_TO_BIT(elem, (m)->density)
+#define SMASK_ELEM_TO_LONG(elem, m)    _SMASK_ELEM_TO_LONG(elem, (m)->density)
+#define SMASK_BYTE_TO_ELEM(byte, m)    _SMASK_BYTE_TO_ELEM(byte, (m)->density)
+#define SMASK_BIT_TO_ELEM(bit, m)      _SMASK_BIT_TO_ELEM(bit, (m)->density)
+#d

[PATCH v3 07/10] sched/fair: Provide can_migrate_task_llc

2018-11-09 Thread Steve Sistare
Define a simpler version of can_migrate_task called can_migrate_task_llc
which does not require a struct lb_env argument, and judges whether a
migration from one CPU to another within the same LLC should be allowed.
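
A rough sketch of how a caller could combine this helper with the generalized
detach_task() from the previous patch (an assumption for illustration only;
pick_stealable() is a hypothetical name, and the real logic lives in
try_steal() in a later patch of this series):

  /* Hypothetical caller: find one CFS task on src_rq that may move to dst_rq. */
  static struct task_struct *pick_stealable(struct rq *src_rq, struct rq *dst_rq)
  {
          struct task_struct *p;

          lockdep_assert_held(&src_rq->lock);

          list_for_each_entry(p, &src_rq->cfs_tasks, se.group_node) {
                  if (can_migrate_task_llc(p, src_rq, dst_rq)) {
                          detach_task(p, src_rq, dst_rq->cpu);
                          return p;
                  }
          }
          return NULL;
  }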

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 453d280..dc6224d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7199,6 +7199,34 @@ int can_migrate_task(struct task_struct *p, struct 
lb_env *env)
 }
 
 /*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+   int dst_cpu = dst_rq->cpu;
+
+   lockdep_assert_held(&rq->lock);
+
+   if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+   return false;
+
+   if (!cpumask_test_cpu(dst_cpu, &p->cpus_allowed)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+   return false;
+   }
+
+   if (task_running(rq, p)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+   return false;
+   }
+
+   return true;
+}
+
+/*
  * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
 static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
-- 
1.8.3.1



[PATCH v3 06/10] sched/fair: Generalize the detach_task interface

2018-11-09 Thread Steve Sistare
The detach_task function takes a struct lb_env argument, but only needs a
few of its members.  Pass the rq and cpu arguments explicitly so the
function may be called from code that is not based on lb_env.  No
functional change.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da368ed..453d280 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7199,15 +7199,15 @@ int can_migrate_task(struct task_struct *p, struct 
lb_env *env)
 }
 
 /*
- * detach_task() -- detach the task for the migration specified in env
+ * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
-static void detach_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
 {
-   lockdep_assert_held(&env->src_rq->lock);
+   lockdep_assert_held(&src_rq->lock);
 
p->on_rq = TASK_ON_RQ_MIGRATING;
-   deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
-   set_task_cpu(p, env->dst_cpu);
+   deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+   set_task_cpu(p, dst_cpu);
 }
 
 /*
@@ -7227,7 +7227,7 @@ static struct task_struct *detach_one_task(struct lb_env 
*env)
if (!can_migrate_task(p, env))
continue;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
 
/*
 * Right now, this is only the second place where
@@ -7294,7 +7294,7 @@ static int detach_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
list_add(&p->se.group_node, &env->tasks);
 
detached++;
-- 
1.8.3.1



[PATCH v3 09/10] sched/fair: disable stealing if too many NUMA nodes

2018-11-09 Thread Steve Sistare
ize the number of
cross-node moves in all conditions, with limited success.  The fundamental
problem is that the scheduler does not track which groups of tasks talk to
each other.  Parts of several groups become entrenched on the same node,
filling it to capacity, leaving no room for either group to pull its peers
over, and there is neither data nor mechanism for the scheduler to evict
one group to make room for the other.

For now, disable STEAL on such systems until we can do better, or it is
shown that hackbench is atypical and most workloads benefit from stealing.
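
The diff below also registers a sched_steal_node_limit early_param, so the
cutoff can be raised without rebuilding.  For example, to keep stealing
enabled on a hypothetical 4-node machine, boot with:

  sched_steal_node_limit=4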

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 16 +---
 kernel/sched/sched.h|  2 +-
 kernel/sched/topology.c | 25 +
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 97bdea2..ac5bbf7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3727,11 +3727,21 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
 
 #define IF_SMP(statement)  statement
 
+static inline bool steal_enabled(void)
+{
+#ifdef CONFIG_NUMA
+   bool allow = static_branch_likely(&sched_steal_allow);
+#else
+   bool allow = true;
+#endif
+   return sched_feat(STEAL) && allow;
+}
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -3745,7 +3755,7 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -9894,7 +9904,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags 
*dst_rf)
int stolen = 0;
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return 0;
 
if (!cpu_active(dst_cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eacf5db..2a28340 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -936,7 +936,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
 #ifdef CONFIG_SCHED_SMT
 
 extern struct static_key_false sched_smt_present;
@@ -1185,6 +1184,7 @@ enum numa_topology_type {
 #endif
 
 #ifdef CONFIG_NUMA
+extern struct static_key_true sched_steal_allow;
 extern void sched_init_numa(void);
 extern void sched_domains_numa_masks_set(unsigned int cpu);
 extern void sched_domains_numa_masks_clear(unsigned int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6455bde..fc511de 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1345,6 +1345,30 @@ static void init_numa_topology_type(void)
}
 }
 
+DEFINE_STATIC_KEY_TRUE(sched_steal_allow);
+static int sched_steal_node_limit;
+#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2
+
+static int __init steal_node_limit_setup(char *buf)
+{
+   get_option(&buf, &sched_steal_node_limit);
+   return 0;
+}
+
+early_param("sched_steal_node_limit", steal_node_limit_setup);
+
+static void check_node_limit(void)
+{
+   int n = num_possible_nodes();
+
+   if (sched_steal_node_limit == 0)
+   sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT;
+   if (n > sched_steal_node_limit) {
+   static_branch_disable(&sched_steal_allow);
+   pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n);
+   }
+}
+
 void sched_init_numa(void)
 {
int next_distance, curr_distance = node_distance(0, 0);
@@ -1493,6 +1517,7 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[level - 1];
 
init_numa_topology_type();
+   check_node_limit();
 }
 
 void sched_domains_numa_masks_set(unsigned int cpu)
-- 
1.8.3.1



[PATCH v3 00/10] steal tasks to improve CPU utilization

2018-11-09 Thread Steve Sistare
   9.3
  16    43.779      0.1     41.741      0.2         4.8

KVM 4-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
tbench, average of 11 runs.

  clients   %speedup
        1       16.2
        2       11.7
        4        9.9
        8       12.8
       16       13.7

KVM 2-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  Benchmark                      %speedup
  specjbb2015_critical_jops           5.7
  mysql_sysb1.0.14_mutex_2           40.6
  mysql_sysb1.0.14_oltp_2             3.9

-- 2 Socket Results --

X6-2: 2 sockets * 10 cores * 2 hyperthreads = 40 CPUs
Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

                  --- base ---         --- new ---
  groups      time   %stdev       time   %stdev    %speedup
       1     7.945      0.2      7.219      8.7        10.0
       2     8.444      0.4      6.689      1.5        26.2
       3    12.100      1.1      9.962      2.0        21.4
       4    15.001      0.4     13.109      1.1        14.4
       8    27.960      0.2     26.127      0.3         7.0

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

                  --- base ---         --- new ---
  groups      time   %stdev       time   %stdev    %speedup
       1     5.826      5.4      5.840      5.0        -0.3
       2     5.041      5.3      6.171     23.4       -18.4
       3     6.839      2.1      6.324      3.8         8.1
       4     8.177      0.6      7.318      3.6        11.7
       8    14.429      0.7     13.966      1.3         3.3
      16    26.401      0.3     25.149      1.5         4.9


X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
120  40   -1.2
240  802.7
360 1208.9
480 1604.4
600 2003.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                        19.6
  mysql_sysb1.0.14_fileio_56_seqrd                        12.1
  mysql_sysb1.0.14_fileio_56_rndwr                         0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                      -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                        19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                         8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                         1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                       0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                          7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M             5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                  4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M           5.8

  lm3_memlat_L2                                            4.8
  lm3_memlat_L1                                            0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching     60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent         5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent        -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks   2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72       1.3
  is      C         72       0.9
  is      D         72       0.7

  sysbench mysql, average of 24 runs
              --- base ---          --- new ---
  nthr      events   %stdev      events   %stdev    %speedup
     1       331.0     0.25       331.0     0.24        -0.1
     2       661.3     0.22       661.8     0.22         0.0
     4      1297.0     0.88      1300.5     0.82         0.2
     8      2420.8     0.04      2420.5     0.04        -0.1
    16      4826.3     0.07      4825.4     0.05        -0.1
    32      8815.3     0.27      8830.2     0.18         0.1
    64     12823.0     0.24     12823.6     0.26         0.0

--

Changes from v1 to v2:
  - Remove stray find_time hunk from patch 5
  - Fix "warning: label out defined but not used" for !CONFIG_SCHED_SMT
  - Set SCHED_STEAL_NODE_LIMIT_DEFAULT to 2
  - Steal iff avg_idle exceeds the cost of stealing

Changes from v2 to v3:
  - Update series for kernel 4.20.  Context changes only.

Steve Sistare (10):
  sched: Provide sparsemask, a reduced contention bitmap
  sched/topology: Provide hooks to allocate data shared per LLC
  sched/topology: Provide cfs_overload_cpus bitmap
  sched/fair: Dynamically update cfs_overload_cpus
  sched/fair: Hoist idle_stamp up from idle_balance
  sched/fair: Generalize the detach_task interface
  sched/fair: Provide can_migrate_task_llc
  sched/fair: Steal work from an overloaded CPU when CPU goes idle
  sched/fair: disable stealing if too many NUMA nodes
  sched/fair: Provide idle search schedstats

 include/linux/sched/topology.h |   1 +
 include/linux/sparsemask.h | 260 +++

[PATCH v3 02/10] sched/topology: Provide hooks to allocate data shared per LLC

2018-11-09 Thread Steve Sistare
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and
free data pointed to by struct sched_domain_shared at the last-level-cache
domain.  sd_llc_alloc_all() is called after the SD hierarchy is known, to
eliminate the unnecessary allocations that would occur if we instead
allocated in __sdt_alloc() and then figured out which shared nodes are
redundant.
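
Patch 3 of this series fills these hooks in; as a minimal sketch of the
intended usage pattern (some_llc_data is a placeholder field name, not part
of this patch):

  /* Sketch only: a client allocates its per-LLC data on the LLC's node. */
  static int sd_llc_alloc(struct sched_domain *sd)
  {
          struct sched_domain_shared *sds = sd->shared;
          int nid = cpu_to_node(cpumask_first(sched_domain_span(sd)));

          sds->some_llc_data = kzalloc_node(sizeof(*sds->some_llc_data),
                                            GFP_KERNEL, nid);
          return sds->some_llc_data ? 0 : 1;      /* nonzero means failure */
  }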

Signed-off-by: Steve Sistare 
---
 kernel/sched/topology.c | 75 -
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8d7f15b..3e72ce0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,12 @@
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
@@ -361,8 +367,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 */
free_sched_groups(sd->groups, 1);
 
-   if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+   if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+   sd_llc_free(sd);
kfree(sd->shared);
+   }
kfree(sd);
 }
 
@@ -996,6 +1004,7 @@ static void __free_domain_allocs(struct s_data *d, enum 
s_alloc what,
free_percpu(d->sd);
/* Fall through */
case sa_sd_storage:
+   sd_llc_free_all(cpu_map);
__sdt_free(cpu_map);
/* Fall through */
case sa_none:
@@ -1610,6 +1619,62 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+   /* Allocate sd->shared data here. Empty for now. */
+
+   return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+   struct sched_domain_shared *sds = sd->shared;
+
+   if (!sds)
+   return;
+
+   /* Free data here. Empty for now. */
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+   struct sched_domain *sd, *hsd;
+   int i;
+
+   for_each_cpu(i, cpu_map) {
+   /* Find highest domain that shares resources */
+   hsd = NULL;
+   for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+   if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+   break;
+   hsd = sd;
+   }
+   if (hsd && sd_llc_alloc(hsd))
+   return 1;
+   }
+
+   return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+   struct sched_domain_topology_level *tl;
+   struct sched_domain *sd;
+   struct sd_data *sdd;
+   int j;
+
+   for_each_sd_topology(tl) {
+   sdd = &tl->data;
+   if (!sdd)
+   continue;
+   for_each_cpu(j, cpu_map) {
+   sd = *per_cpu_ptr(sdd->sd, j);
+   if (sd)
+   sd_llc_free(sd);
+   }
+   }
+}
+
 static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int dflags, int cpu)
@@ -1769,6 +1834,14 @@ static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_leve
}
}
 
+   /*
+* Allocate shared sd data at last level cache.  Must be done after
+* domains are built above, but before the data is used in
+* cpu_attach_domain and descendants below.
+*/
+   if (sd_llc_alloc_all(cpu_map, &d))
+   goto error;
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
-- 
1.8.3.1



[PATCH v3 10/10] sched/fair: Provide idle search schedstats

2018-11-09 Thread Steve Sistare
Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
  prev or target is idle.
field 11: # of times select_idle_sibling searched and found an idle cpu.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
  CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
   migratable.
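
A userspace sketch (not part of the patch) of how the new per-CPU fields can
be read back, assuming the version-16 cpu-line layout described above with
1-based field numbering:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  int main(void)
  {
          char line[1024];
          FILE *f = fopen("/proc/schedstat", "r");

          if (!f)
                  return 1;
          while (fgets(line, sizeof(line), f)) {
                  unsigned long long v[17] = { 0 };
                  char *tok;
                  int n = 0;

                  if (strncmp(line, "cpu", 3))
                          continue;       /* skip version/timestamp/domain lines */
                  strtok(line, " ");      /* first token is "cpuN" */
                  while (n < 16 && (tok = strtok(NULL, " \n")))
                          v[++n] = strtoull(tok, NULL, 10);
                  if (n < 16)
                          continue;       /* kernel without these fields */
                  printf("%s steals=%llu search_ns=%llu\n", line, v[15], v[14]);
          }
          fclose(f);
          return 0;
  }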

Signed-off-by: Steve Sistare 
---
 kernel/sched/core.c  | 30 +++--
 kernel/sched/fair.c  | 54 ++--
 kernel/sched/sched.h |  9 +
 kernel/sched/stats.c | 11 ++-
 kernel/sched/stats.h | 13 +
 5 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f..49b48da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2220,17 +2220,43 @@ int sysctl_numa_balancing(struct ctl_table *table, int 
write,
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
 
+unsigned long schedstat_skid;
+
+static void compute_skid(void)
+{
+   int i, n = 0;
+   s64 t, skid = 0;
+
+   for (i = 0; i < 100; i++) {
+   t = local_clock();
+   t = local_clock() - t;
+   if (t > 0 && t < 1000) {/* only use sane samples */
+   skid += t;
+   n++;
+   }
+   }
+
+   if (n > 0)
+   schedstat_skid = skid / n;
+   else
+   schedstat_skid = 0;
+   pr_info("schedstat_skid = %lu\n", schedstat_skid);
+}
+
 static void set_schedstats(bool enabled)
 {
-   if (enabled)
+   if (enabled) {
+   compute_skid();
static_branch_enable(&sched_schedstats);
-   else
+   } else {
static_branch_disable(&sched_schedstats);
+   }
 }
 
 void force_schedstat_enabled(void)
 {
if (!schedstat_enabled()) {
+   compute_skid();
pr_info("kernel profiling enabled schedstats, disable via 
kernel.sched_schedstats.\n");
static_branch_enable(_schedstats);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac5bbf7..115b1a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3740,29 +3740,35 @@ static inline bool steal_enabled(void)
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_clear_elem(rq->cpu, overload_cpus);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_set_elem(rq->cpu, overload_cpus);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -6183,6 +6189,16 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
return cpu;
 }
 
+#define SET_STAT(STAT) \
+   do {\
+   if (schedstat_enabled()) {  \
+   struct rq *rq = this_rq();  \
+   \
+   if (rq) \
+   __schedstat_inc(rq->STAT);  \
+   }   \
+   } while (0)
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -6191,14 +6207,18 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
 
-   if (available_idle_cpu(target))
+   if (available_idle_cpu(target)) 

[PATCH v3 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle

2018-11-09 Thread Steve Sistare
 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

                  --- base ---         --- new ---
  groups      time   %stdev       time   %stdev    %speedup
       1     5.826      5.4      5.840      5.0        -0.3
       2     5.041      5.3      6.171     23.4       -18.4
       3     6.839      2.1      6.324      3.8         8.1
       4     8.177      0.6      7.318      3.6        11.7
       8    14.429      0.7     13.966      1.3         3.3
      16    26.401      0.3     25.149      1.5         4.9

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
120  40   -1.2
240  802.7
360 1208.9
480 1604.4
600 2003.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                        19.6
  mysql_sysb1.0.14_fileio_56_seqrd                        12.1
  mysql_sysb1.0.14_fileio_56_rndwr                         0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                      -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                        19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                         8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                         1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                       0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                          7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M             5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                  4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M           5.8

  lm3_memlat_L2                                            4.8
  lm3_memlat_L1                                            0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching     60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent         5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent        -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks   2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72       1.3
  is      C         72       0.9
  is      D         72       0.7

  sysbench mysql, average of 24 runs
              --- base ---          --- new ---
  nthr      events   %stdev      events   %stdev    %speedup
     1       331.0     0.25       331.0     0.24        -0.1
     2       661.3     0.22       661.8     0.22         0.0
     4      1297.0     0.88      1300.5     0.82         0.2
     8      2420.8     0.04      2420.5     0.04        -0.1
    16      4826.3     0.07      4825.4     0.05        -0.1
    32      8815.3     0.27      8830.2     0.18         0.1
    64     12823.0     0.24     12823.6     0.26         0.0

-----

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 169 ++--
 kernel/sched/features.h |   6 ++
 2 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dc6224d..97bdea2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3731,6 +3731,9 @@ static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3742,6 +3745,9 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3749,6 +3755,8 @@ static void overload_set(struct rq *rq)
rcu_read_unlock();
 }
 
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3785,6 +3793,11 @@ static inline void overload_set(struct rq *rq) {}
 bool task_sleep) {}
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 
{}
 
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -6770,20 +6783,22 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
update_misfit_status(NULL, rq);
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
+* We must set idle_stamp _before_ calling try_steal() or
+* idle_balance(), such that we measure the duration as idle time.
 */
IF_SMP(rq->idle_stamp = rq_clock(rq);)
 
new_tasks = idle_balance(rq, rf);
+   if (new_tasks ==

[PATCH v3 05/10] sched/fair: Hoist idle_stamp up from idle_balance

2018-11-09 Thread Steve Sistare
Move the update of idle_stamp from idle_balance to the call site in
pick_next_task_fair, to prepare for a future patch that adds work to
pick_next_task_fair which must be included in the idle_stamp interval.
No functional change.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9031d39..da368ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3725,6 +3725,8 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
rq->misfit_task_load = task_h_load(p);
 }
 
+#define IF_SMP(statement)  statement
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
@@ -3770,6 +3772,8 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+#define IF_SMP(statement)  /* empty */
+
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
 
@@ -6764,8 +6768,18 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
 
 idle:
update_misfit_status(NULL, rq);
+
+   /*
+* We must set idle_stamp _before_ calling idle_balance(), such that we
+* measure the duration of idle_balance() as idle time.
+*/
+   IF_SMP(rq->idle_stamp = rq_clock(rq);)
+
new_tasks = idle_balance(rq, rf);
 
+   if (new_tasks)
+   IF_SMP(rq->idle_stamp = 0;)
+
/*
 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 * possible for any higher priority task to appear. In that case we
@@ -9611,12 +9625,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
u64 curr_cost = 0;
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
-*/
-   this_rq->idle_stamp = rq_clock(this_rq);
-
-   /*
 * Do not pull tasks towards !active CPUs...
 */
if (!cpu_active(this_cpu))
@@ -9707,9 +9715,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
-   if (pulled_task)
-   this_rq->idle_stamp = 0;
-
rq_repin_lock(this_rq, rf);
 
return pulled_task;
-- 
1.8.3.1



[PATCH v3 03/10] sched/topology: Provide cfs_overload_cpus bitmap

2018-11-09 Thread Steve Sistare
From: Steve Sistare 

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.
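
For reference, the reader side that later patches layer on top of this rq
pointer follows the usual RCU pattern (a sketch mirroring overload_set() in
the fair.c changes later in the series; mark_overloaded is an illustrative
name only):

  static void mark_overloaded(struct rq *rq)
  {
          struct sparsemask *overload_cpus;

          rcu_read_lock();
          overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
          if (overload_cpus)
                  sparsemask_set_elem(rq->cpu, overload_cpus);
          rcu_read_unlock();
  }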

Signed-off-by: Steve Sistare 
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h   |  2 ++
 kernel/sched/topology.c| 21 +++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6b99761..b173a77 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@ struct sched_domain_shared {
atomic_tref;
atomic_tnr_busy_cpus;
int has_idle_cores;
+   struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577f..eacf5db 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED  1
@@ -812,6 +813,7 @@ struct rq {
struct cfs_rq   cfs;
struct rt_rqrt;
struct dl_rqdl;
+   struct sparsemask   *cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3e72ce0..6455bde 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include 
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -441,6 +442,7 @@ static void update_top_cache_domain(int cpu)
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
+   struct sparsemask *cfs_overload_cpus;
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
 
@@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
 
+   sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+   cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
+   rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+
update_top_cache_domain(cpu);
 }
 
@@ -1619,9 +1625,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+#define ZALLOC_MASK(maskp, nelems, node) \
+   (!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,  \
+ SPARSEMASK_DENSITY_DEFAULT, \
+ GFP_KERNEL, node))  \
+
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-   /* Allocate sd->shared data here. Empty for now. */
+   struct sched_domain_shared *sds = sd->shared;
+   struct cpumask *span = sched_domain_span(sd);
+   int nid = cpu_to_node(cpumask_first(span));
+
+   if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
+   return 1;
 
return 0;
 }
@@ -1633,7 +1649,8 @@ static void sd_llc_free(struct sched_domain *sd)
if (!sds)
return;
 
-   /* Free data here. Empty for now. */
+   free_sparsemask(sds->cfs_overload_cpus);
+   sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
-- 
1.8.3.1



[PATCH v3 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle

2018-11-09 Thread Steve Sistare
 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

              --- base --      --- new ---
  groups    time  %stdev     time  %stdev   %speedup
       1   5.826     5.4    5.840     5.0       -0.3
       2   5.041     5.3    6.171    23.4      -18.4
       3   6.839     2.1    6.324     3.8        8.1
       4   8.177     0.6    7.318     3.6       11.7
       8  14.429     0.7   13.966     1.3        3.3
      16  26.401     0.3   25.149     1.5        4.9

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers    Users    %speedup
        120       40        -1.2
        240       80         2.7
        360      120         8.9
        480      160         4.4
        600      200         3.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                               19.6
  mysql_sysb1.0.14_fileio_56_seqrd                               12.1
  mysql_sysb1.0.14_fileio_56_rndwr                                0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                             -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                               19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                                8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                                1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                              0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                                 7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M                    5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                         4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M                  5.8

  lm3_memlat_L2                                                   4.8
  lm3_memlat_L1                                                   0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching            60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent               5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent              -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks    2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72               1.3
  is      C         72               0.9
  is      D         72               0.7

  sysbench mysql, average of 24 runs
              --- base ---      --- new ---
  nthr     events  %stdev     events  %stdev   %speedup
     1      331.0    0.25      331.0    0.24       -0.1
     2      661.3    0.22      661.8    0.22        0.0
     4     1297.0    0.88     1300.5    0.82        0.2
     8     2420.8    0.04     2420.5    0.04       -0.1
    16     4826.3    0.07     4825.4    0.05       -0.1
    32     8815.3    0.27     8830.2    0.18        0.1
    64    12823.0    0.24    12823.6    0.26        0.0

-----

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 169 ++--
 kernel/sched/features.h |   6 ++
 2 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dc6224d..97bdea2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3731,6 +3731,9 @@ static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3742,6 +3745,9 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3749,6 +3755,8 @@ static void overload_set(struct rq *rq)
rcu_read_unlock();
 }
 
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3785,6 +3793,11 @@ static inline void overload_set(struct rq *rq) {}
 bool task_sleep) {}
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 
{}
 
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -6770,20 +6783,22 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
update_misfit_status(NULL, rq);
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
+* We must set idle_stamp _before_ calling try_steal() or
+* idle_balance(), such that we measure the duration as idle time.
 */
IF_SMP(rq->idle_stamp = rq_clock(rq);)
 
new_tasks = idle_balance(rq, rf);
+   if (new_tasks ==

[PATCH v3 04/10] sched/fair: Dynamically update cfs_overload_cpus

2018-11-09 Thread Steve Sistare
An overloaded CPU has more than 1 runnable task.  When a CFS task wakes
on a CPU, if h_nr_running transitions from 1 to more, then set the CPU in
the cfs_overload_cpus bitmap.  When a CFS task sleeps, if h_nr_running
transitions from 2 to less, then clear the CPU in cfs_overload_cpus.
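
Condensed from the enqueue/dequeue hunks below, the transition check is
roughly (a sketch only; the throttle/unthrottle paths use task_delta instead
of 1):

        unsigned int prev_nr = rq->cfs.h_nr_running;

        /* enqueue_task_fair(): 1 -> 2 means the CPU just became overloaded */
        add_nr_running(rq, 1);
        if (prev_nr == 1)
                overload_set(rq);

        /* dequeue_task_fair(): 2 -> 1 means it is overloaded no longer */
        sub_nr_running(rq, 1);
        if (prev_nr == 2)
                overload_clear(rq);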

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 52 
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb..9031d39 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
 #include "sched.h"
 
 #include 
+#include 
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -3724,6 +3725,28 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
rq->misfit_task_load = task_h_load(p);
 }
 
+static void overload_clear(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_clear_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_set_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3747,6 +3770,9 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
 static inline void
 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 
@@ -4441,6 +4467,7 @@ static int tg_throttle_down(struct task_group *tg, void 
*data)
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
@@ -4468,8 +4495,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue = 0;
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, task_delta);
+   if (prev_nr >= 2 && prev_nr - task_delta < 2)
+   overload_clear(rq);
+
+   }
 
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4499,6 +4530,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
@@ -4535,8 +4567,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, task_delta);
+   if (prev_nr < 2 && prev_nr + task_delta >= 2)
+   overload_set(rq);
+   }
 
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -5082,6 +5117,7 @@ static inline void hrtick_update(struct rq *rq)
 {
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
/*
 * The code below (indirectly) updates schedutil which looks at
@@ -5129,8 +5165,12 @@ static inline void hrtick_update(struct rq *rq)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+   if (prev_nr == 1)
+   overload_set(rq);
+
+   }
 
hrtick_update(rq);
 }
@@ -5147,6 +5187,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5188,8 +5229,11 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, 1);
+   if (prev_nr == 2)
+   overload_clear(rq);
+   }
 
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
-- 
1.8.3.1



[PATCH v2 10/10] sched/fair: Provide idle search schedstats

2018-11-05 Thread Steve Sistare
Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
  prev or target is idle.
field 11: # of times select_idle_sibling searched and found an idle cpu.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
  CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
   migratable.

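A hypothetical userspace reader of the new counters (names and line layout
below are assumptions based only on the field description above, with
fields 10-16 appended to the existing per-CPU fields; this is not code from
the patch):

        #include <stdio.h>

        int main(void)
        {
                char line[1024];
                FILE *f = fopen("/proc/schedstat", "r");

                if (!f)
                        return 1;
                while (fgets(line, sizeof(line), f)) {
                        unsigned long long v[16];
                        char cpu[16];

                        /* per-CPU lines look like "cpuN f1 f2 ... f16" */
                        if (sscanf(line, "cpu%15s %llu %llu %llu %llu %llu %llu %llu %llu"
                                   " %llu %llu %llu %llu %llu %llu %llu %llu",
                                   cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6],
                                   &v[7], &v[8], &v[9], &v[10], &v[11], &v[12], &v[13],
                                   &v[14], &v[15]) != 17)
                                continue;
                        printf("cpu%s: easy %llu cpu %llu core %llu none %llu"
                               " find_ns %llu steals %llu failed %llu\n",
                               cpu, v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
                }
                fclose(f);
                return 0;
        }
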
Signed-off-by: Steve Sistare 
---
 kernel/sched/core.c  | 30 +++--
 kernel/sched/fair.c  | 54 ++--
 kernel/sched/sched.h |  9 +
 kernel/sched/stats.c | 11 ++-
 kernel/sched/stats.h | 13 +
 5 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad97f3b..b61d15d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2214,17 +2214,43 @@ int sysctl_numa_balancing(struct ctl_table *table, int 
write,
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
 
+unsigned long schedstat_skid;
+
+static void compute_skid(void)
+{
+   int i, n = 0;
+   s64 t, skid = 0;
+
+   for (i = 0; i < 100; i++) {
+   t = local_clock();
+   t = local_clock() - t;
+   if (t > 0 && t < 1000) {/* only use sane samples */
+   skid += t;
+   n++;
+   }
+   }
+
+   if (n > 0)
+   schedstat_skid = skid / n;
+   else
+   schedstat_skid = 0;
+   pr_info("schedstat_skid = %lu\n", schedstat_skid);
+}
+
 static void set_schedstats(bool enabled)
 {
-   if (enabled)
+   if (enabled) {
+   compute_skid();
static_branch_enable(&sched_schedstats);
-   else
+   } else {
static_branch_disable(&sched_schedstats);
+   }
 }
 
 void force_schedstat_enabled(void)
 {
if (!schedstat_enabled()) {
+   compute_skid();
pr_info("kernel profiling enabled schedstats, disable via 
kernel.sched_schedstats.\n");
static_branch_enable(&sched_schedstats);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56dce30..21ffe34 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3739,29 +3739,35 @@ static inline bool steal_enabled(void)
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_clear_elem(rq->cpu, overload_cpus);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
+   unsigned long time;
 
if (!steal_enabled())
return;
 
+   time = schedstat_start_time();
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_set_elem(rq->cpu, overload_cpus);
rcu_read_unlock();
+   schedstat_end_time(rq->find_time, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -6165,6 +6171,16 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
return cpu;
 }
 
+#define SET_STAT(STAT) \
+   do {\
+   if (schedstat_enabled()) {  \
+   struct rq *rq = this_rq();  \
+   \
+   if (rq) \
+   __schedstat_inc(rq->STAT);  \
+   }   \
+   } while (0)
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -6173,14 +6189,18 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
 
-   if (available_idle_cpu(target))
+   if (available_idle_cpu(target)) 

[PATCH v2 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-11-05 Thread Steve Sistare
From: Steve Sistare 

Provide struct sparsemask and functions to manipulate it.  A sparsemask is
a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
threads concurrently set, clear, and visit elements, by reducing the number
of significant bits per cacheline.  For each 64 byte chunk of the mask,
only the first K bits of the first word are used, and the remaining bits
are ignored, where K is a creation time parameter.  Thus a sparsemask that
can represent a set of N elements is approximately (N/K * 64) bytes in
size.
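
A standalone sketch of the element-to-bit arithmetic described above,
assuming (purely for illustration) a density of 3, i.e. K = 8 elements per
64-byte chunk; the real macros in this patch take the density from the mask:

        #include <stdio.h>

        #define SMASK_BITS      512     /* bits spanned by one 64-byte chunk */

        static unsigned int elem_to_bit(unsigned int elem, unsigned int density)
        {
                unsigned int k = 1u << density;         /* elements per chunk */

                return elem / k * SMASK_BITS + elem % k;
        }

        int main(void)
        {
                /* element 20 lands in chunk 2, bit 4 of that chunk's first word */
                printf("elem 20 -> bit %u\n", elem_to_bit(20, 3));      /* 1028 */
                /* a 512-element mask at K = 8 takes 512/8 * 64 = 4096 bytes */
                printf("size = %u bytes\n", 512 / 8 * 64);
                return 0;
        }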

Signed-off-by: Steve Sistare 
---
 include/linux/sparsemask.h | 260 +
 lib/Makefile   |   2 +-
 lib/sparsemask.c   | 142 +
 3 files changed, 403 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sparsemask.h
 create mode 100644 lib/sparsemask.c

diff --git a/include/linux/sparsemask.h b/include/linux/sparsemask.h
new file mode 100644
index 000..d36a3be
--- /dev/null
+++ b/include/linux/sparsemask.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include 
+#include 
+#include 
+
+/*
+ * A sparsemask is a sparse bitmap.  It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements.  For
+ * each 64 byte chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter.  Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * 64) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ *
+ * This file is partially derived from cpumask.h, and the public sparsemask
+ * operations are drop-in replacements for cpumask operations. However,
+ * sparsemask has no dependency on CPU definitions and can be used to
+ * represent any kind of elements.
+ */
+
+struct sparsemask {
+   short nelems;   /* current number of elements */
+   short density;  /* store 2^density elements per chunk */
+   unsigned long bits[0];  /* embedded array of chunks */
+};
+
+/* The maximum value for density, which implicitly defines the chunk size */
+
+#define _SMASK_DENSITY_MAX 6
+
+#define SMASK_DENSITY_TO_BYTES(density)(1U << (density))
+#define SMASK_DENSITY_TO_ELEMS(density)(1U << (density))
+
+/* The number of elements/bits/bytes/longs in a chunk */
+
+#define SMASK_ELEMS(mask)  SMASK_DENSITY_TO_ELEMS((mask)->density)
+#define SMASK_BYTESSMASK_DENSITY_TO_BYTES(_SMASK_DENSITY_MAX)
+#define SMASK_BITS (SMASK_BYTES * BITS_PER_BYTE)
+#define SMASK_LONGS(SMASK_BYTES / sizeof(long))
+
+/*
+ * Translate element index @elem to a bit/byte/long index.
+ * @density: the density of a chunk.
+ */
+
+#define _SMASK_ELEM_TO_BIT(elem, density)  \
+   ((elem) / SMASK_DENSITY_TO_ELEMS(density) * SMASK_BITS +\
+(elem) % SMASK_DENSITY_TO_ELEMS(density))
+
+#define _SMASK_ELEM_TO_BYTE(elem, density) \
+   (_SMASK_ELEM_TO_BIT(elem, density) / BITS_PER_BYTE)
+
+#define _SMASK_ELEM_TO_LONG(elem, density) \
+   (_SMASK_ELEM_TO_BYTE(elem, density) / sizeof(long))
+
+/* Translate @bit/@byte/@long index to an element index */
+
+#define _SMASK_BIT_TO_ELEM(bit, density)   \
+   ((bit) / SMASK_BITS * SMASK_DENSITY_TO_ELEMS(density) + \
+(bit) % SMASK_BITS)
+
+#define _SMASK_BYTE_TO_ELEM(byte, density) \
+   _SMASK_BIT_TO_ELEM((byte) * BITS_PER_BYTE, density)
+
+#define _SMASK_LONG_TO_ELEM(index, density)\
+   _SMASK_BYTE_TO_ELEM((index) * sizeof(long), density)
+
+/* Same translations as above, but taking sparsemask @m instead of density */
+
+#define SMASK_ELEM_TO_BYTE(elem, m)_SMASK_ELEM_TO_BYTE(elem, (m)->density)
+#define SMASK_ELEM_TO_BIT(elem, m) _SMASK_ELEM_TO_BIT(elem, (m)->density)
+#define SMASK_ELEM_TO_LONG(elem, m)_SMASK_ELEM_TO_LONG(elem, (m)->density)
+#define SMASK_BYTE_TO_ELEM(byte, m)_SMASK_BYTE_TO_ELEM(byte, (m)->density)
+#define SMASK_BIT_TO_ELEM(bit, m)  _SMASK_BIT_TO_ELEM(bit, (m)->density)
+#d

[PATCH v2 09/10] sched/fair: disable stealing if too many NUMA nodes

2018-11-05 Thread Steve Sistare
ize the number of
cross-node moves in all conditions, with limited success.  The fundamental
problem is that the scheduler does not track which groups of tasks talk to
each other.  Parts of several groups become entrenched on the same node,
filling it to capacity, leaving no room for either group to pull its peers
over, and there is neither data nor mechanism for the scheduler to evict
one group to make room for the other.

For now, disable STEAL on such systems until we can do better, or it is
shown that hackbench is atypical and most workloads benefit from stealing.
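
For example (based on the early_param and pr_debug added below), an
administrator who knows stealing helps a particular 4-node system can
re-enable it by booting with:

        sched_steal_node_limit=4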

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 16 +---
 kernel/sched/sched.h|  2 +-
 kernel/sched/topology.c | 25 +
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f12f56..56dce30 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3726,11 +3726,21 @@ static inline bool within_margin(int value, int margin)
 
 #define IF_SMP(statement)  statement
 
+static inline bool steal_enabled(void)
+{
+#ifdef CONFIG_NUMA
+   bool allow = static_branch_likely(&sched_steal_allow);
+#else
+   bool allow = true;
+#endif
+   return sched_feat(STEAL) && allow;
+}
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -3744,7 +3754,7 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return;
 
rcu_read_lock();
@@ -9786,7 +9796,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags 
*dst_rf)
int stolen = 0;
struct sparsemask *overload_cpus;
 
-   if (!sched_feat(STEAL))
+   if (!steal_enabled())
return 0;
 
if (!cpu_active(dst_cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aadfe68..5f181e9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -928,7 +928,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
 #ifdef CONFIG_SCHED_SMT
 
 extern struct static_key_false sched_smt_present;
@@ -1083,6 +1082,7 @@ enum numa_topology_type {
 #endif
 
 #ifdef CONFIG_NUMA
+extern struct static_key_true sched_steal_allow;
 extern void sched_init_numa(void);
 extern void sched_domains_numa_masks_set(unsigned int cpu);
 extern void sched_domains_numa_masks_clear(unsigned int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f18c416..e80c354 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1337,6 +1337,30 @@ static void init_numa_topology_type(void)
}
 }
 
+DEFINE_STATIC_KEY_TRUE(sched_steal_allow);
+static int sched_steal_node_limit;
+#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2
+
+static int __init steal_node_limit_setup(char *buf)
+{
+   get_option(&buf, &sched_steal_node_limit);
+   return 0;
+}
+
+early_param("sched_steal_node_limit", steal_node_limit_setup);
+
+static void check_node_limit(void)
+{
+   int n = num_possible_nodes();
+
+   if (sched_steal_node_limit == 0)
+   sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT;
+   if (n > sched_steal_node_limit) {
+   static_branch_disable(&sched_steal_allow);
+   pr_debug("Suppressing sched STEAL. To enable, reboot with 
sched_steal_node_limit=%d", n);
+   }
+}
+
 void sched_init_numa(void)
 {
int next_distance, curr_distance = node_distance(0, 0);
@@ -1485,6 +1509,7 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[level - 1];
 
init_numa_topology_type();
+   check_node_limit();
 }
 
 void sched_domains_numa_masks_set(unsigned int cpu)
-- 
1.8.3.1



[PATCH v2 06/10] sched/fair: Generalize the detach_task interface

2018-11-05 Thread Steve Sistare
The detach_task function takes a struct lb_env argument, but only needs a
few of its members.  Pass the rq and cpu arguments explicitly so the
function may be called from code that is not based on lb_env.  No
functional change.
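
A hypothetical caller, purely to illustrate the new signature (the real
user outside of lb_env is the steal path added later in the series):

        static void steal_one(struct rq *src_rq, struct task_struct *p, int dst_cpu)
        {
                lockdep_assert_held(&src_rq->lock);     /* same precondition as before */
                detach_task(p, src_rq, dst_cpu);        /* was: detach_task(p, env) */
        }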

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 305edf8..eb6e6cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7166,15 +7166,15 @@ int can_migrate_task(struct task_struct *p, struct 
lb_env *env)
 }
 
 /*
- * detach_task() -- detach the task for the migration specified in env
+ * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
-static void detach_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
 {
-   lockdep_assert_held(&env->src_rq->lock);
+   lockdep_assert_held(&src_rq->lock);
 
p->on_rq = TASK_ON_RQ_MIGRATING;
-   deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
-   set_task_cpu(p, env->dst_cpu);
+   deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+   set_task_cpu(p, dst_cpu);
 }
 
 /*
@@ -7194,7 +7194,7 @@ static struct task_struct *detach_one_task(struct lb_env 
*env)
if (!can_migrate_task(p, env))
continue;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
 
/*
 * Right now, this is only the second place where
@@ -7261,7 +7261,7 @@ static int detach_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
 
-   detach_task(p, env);
+   detach_task(p, env->src_rq, env->dst_cpu);
list_add(&p->se.group_node, &env->tasks);
 
detached++;
-- 
1.8.3.1



[PATCH v2 03/10] sched/topology: Provide cfs_overload_cpus bitmap

2018-11-05 Thread Steve Sistare
From: Steve Sistare 

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.

Signed-off-by: Steve Sistare 
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h   |  2 ++
 kernel/sched/topology.c| 21 +++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2634774..8bac15d 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@ struct sched_domain_shared {
atomic_tref;
atomic_tnr_busy_cpus;
int has_idle_cores;
+   struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa33..aadfe68 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED  1
@@ -805,6 +806,7 @@ struct rq {
struct cfs_rq   cfs;
struct rt_rqrt;
struct dl_rqdl;
+   struct sparsemask   *cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a2363f6..f18c416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include 
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -440,6 +441,7 @@ static void update_top_cache_domain(int cpu)
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
+   struct sparsemask *cfs_overload_cpus;
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
 
@@ -481,6 +483,10 @@ static void update_top_cache_domain(int cpu)
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
 
+   sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+   cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
+   rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+
update_top_cache_domain(cpu);
 }
 
@@ -1611,9 +1617,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+#define ZALLOC_MASK(maskp, nelems, node) \
+   (!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,  \
+ SPARSEMASK_DENSITY_DEFAULT, \
+ GFP_KERNEL, node))  \
+
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-   /* Allocate sd->shared data here. Empty for now. */
+   struct sched_domain_shared *sds = sd->shared;
+   struct cpumask *span = sched_domain_span(sd);
+   int nid = cpu_to_node(cpumask_first(span));
+
+   if (ZALLOC_MASK(>cfs_overload_cpus, nr_cpu_ids, nid))
+   return 1;
 
return 0;
 }
@@ -1625,7 +1641,8 @@ static void sd_llc_free(struct sched_domain *sd)
if (!sds)
return;
 
-   /* Free data here. Empty for now. */
+   free_sparsemask(sds->cfs_overload_cpus);
+   sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
-- 
1.8.3.1



[PATCH v2 04/10] sched/fair: Dynamically update cfs_overload_cpus

2018-11-05 Thread Steve Sistare
An overloaded CPU has more than 1 runnable task.  When a CFS task wakes
on a CPU, if h_nr_running transitions from 1 to more, then set the CPU in
the cfs_overload_cpus bitmap.  When a CFS task sleeps, if h_nr_running
transitions from 2 to less, then clear the CPU in cfs_overload_cpus.

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 52 
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a37..c623338 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
 #include "sched.h"
 
 #include 
+#include 
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -3723,6 +3724,28 @@ static inline bool within_margin(int value, int margin)
WRITE_ONCE(p->se.avg.util_est, ue);
 }
 
+static void overload_clear(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_clear_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_set_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3746,6 +3769,9 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
 static inline void
 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 
@@ -4439,6 +4465,7 @@ static int tg_throttle_down(struct task_group *tg, void 
*data)
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
@@ -4466,8 +4493,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue = 0;
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, task_delta);
+   if (prev_nr >= 2 && prev_nr - task_delta < 2)
+   overload_clear(rq);
+
+   }
 
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4493,6 +4524,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
@@ -4529,8 +4561,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, task_delta);
+   if (prev_nr < 2 && prev_nr + task_delta >= 2)
+   overload_set(rq);
+   }
 
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -5064,6 +5099,7 @@ static inline void hrtick_update(struct rq *rq)
 {
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
/*
 * The code below (indirectly) updates schedutil which looks at
@@ -5111,8 +5147,12 @@ static inline void hrtick_update(struct rq *rq)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+   if (prev_nr == 1)
+   overload_set(rq);
+
+   }
 
hrtick_update(rq);
 }
@@ -5129,6 +5169,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5170,8 +5211,11 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, 1);
+   if (prev_nr == 2)
+   overload_clear(rq);
+   }
 
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
-- 
1.8.3.1



[PATCH v2 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle

2018-11-05 Thread Steve Sistare
 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

              --- base --      --- new ---
  groups    time  %stdev     time  %stdev   %speedup
       1   5.826     5.4    5.840     5.0       -0.3
       2   5.041     5.3    6.171    23.4      -18.4
       3   6.839     2.1    6.324     3.8        8.1
       4   8.177     0.6    7.318     3.6       11.7
       8  14.429     0.7   13.966     1.3        3.3
      16  26.401     0.3   25.149     1.5        4.9

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers    Users    %speedup
        120       40        -1.2
        240       80         2.7
        360      120         8.9
        480      160         4.4
        600      200         3.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd19.6
  mysql_sysb1.0.14_fileio_56_seqrd12.1
  mysql_sysb1.0.14_fileio_56_rndwr 0.4
  mysql_sysb1.0.14_fileio_56_seqrewr  -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd19.5
  pgsql_sysb1.0.14_fileio_56_seqrd 8.6
  pgsql_sysb1.0.14_fileio_56_rndwr 1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr   0.5

  opatch_time_ASM_12.2.0.1.0_HP2M  7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M 5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M  4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M   5.8

  lm3_memlat_L24.8
  lm3_memlat_L10.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching 60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent   -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks 2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72               1.3
  is      C         72               0.9
  is      D         72               0.7

  sysbench mysql, average of 24 runs
            --- base ---         --- new ---
  nthr    events  %stdev      events  %stdev    %speedup
     1     331.0    0.25       331.0    0.24        -0.1
     2     661.3    0.22       661.8    0.22         0.0
     4    1297.0    0.88      1300.5    0.82         0.2
     8    2420.8    0.04      2420.5    0.04        -0.1
    16    4826.3    0.07      4825.4    0.05        -0.1
    32    8815.3    0.27      8830.2    0.18         0.1
    64   12823.0    0.24     12823.6    0.26         0.0

-----

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 169 ++--
 kernel/sched/features.h |   6 ++
 2 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b08383..0f12f56 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3730,6 +3730,9 @@ static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3741,6 +3744,9 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3748,6 +3754,8 @@ static void overload_set(struct rq *rq)
rcu_read_unlock();
 }
 
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3783,6 +3791,11 @@ static inline void overload_set(struct rq *rq) {}
 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 bool task_sleep) {}
 
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -6745,20 +6758,22 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
 
 idle:
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
+* We must set idle_stamp _before_ calling try_steal() or
+* idle_balance(), such that we measure the duration as idle time.
 */
IF_SMP(rq->idle_stamp = rq_clock(rq);)
 
new_tasks = idle_balance(rq, rf);
+   if (new_tasks == 0)
+   new_tasks = try_steal(rq,

[PATCH v2 04/10] sched/fair: Dynamically update cfs_overload_cpus

2018-11-05 Thread Steve Sistare
An overloaded CPU has more than 1 runnable task.  When a CFS task wakes
on a CPU, if h_nr_running transitions from 1 to 2 or more, set the CPU in
the cfs_overload_cpus bitmap.  When a CFS task sleeps, if h_nr_running
transitions from 2 or more to fewer than 2, clear the CPU in cfs_overload_cpus.
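
In code, the common enqueue/dequeue case reduces to the following sketch
(condensed from the hunks below; the throttle and unthrottle paths make the
same check against the number of tasks moved at once):

	/* at entry to enqueue_task_fair() */
	unsigned int prev_nr = rq->cfs.h_nr_running;

	/* ... enqueue the entities ... */

	add_nr_running(rq, 1);
	if (prev_nr == 1)		/* 1 -> 2: CPU is now overloaded */
		overload_set(rq);

	/* at entry to dequeue_task_fair() */
	prev_nr = rq->cfs.h_nr_running;

	/* ... dequeue the entities ... */

	sub_nr_running(rq, 1);
	if (prev_nr == 2)		/* 2 -> 1: CPU is no longer overloaded */
		overload_clear(rq);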

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 52 
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a37..c623338 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
 #include "sched.h"
 
 #include 
+#include <linux/sparsemask.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -3723,6 +3724,28 @@ static inline bool within_margin(int value, int margin)
WRITE_ONCE(p->se.avg.util_est, ue);
 }
 
+static void overload_clear(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_clear_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+   struct sparsemask *overload_cpus;
+
+   rcu_read_lock();
+   overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+   if (overload_cpus)
+   sparsemask_set_elem(rq->cpu, overload_cpus);
+   rcu_read_unlock();
+}
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3746,6 +3769,9 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
 static inline void
 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 
@@ -4439,6 +4465,7 @@ static int tg_throttle_down(struct task_group *tg, void 
*data)
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
@@ -4466,8 +4493,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue = 0;
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, task_delta);
+   if (prev_nr >= 2 && prev_nr - task_delta < 2)
+   overload_clear(rq);
+
+   }
 
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4493,6 +4524,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
+   unsigned int prev_nr = rq->cfs.h_nr_running;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
@@ -4529,8 +4561,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, task_delta);
+   if (prev_nr < 2 && prev_nr + task_delta >= 2)
+   overload_set(rq);
+   }
 
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -5064,6 +5099,7 @@ static inline void hrtick_update(struct rq *rq)
 {
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
/*
 * The code below (indirectly) updates schedutil which looks at
@@ -5111,8 +5147,12 @@ static inline void hrtick_update(struct rq *rq)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+   if (prev_nr == 1)
+   overload_set(rq);
+
+   }
 
hrtick_update(rq);
 }
@@ -5129,6 +5169,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+   unsigned int prev_nr = rq->cfs.h_nr_running;
 
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5170,8 +5211,11 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_cfs_group(se);
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, 1);
+   if (prev_nr == 2)
+   overload_clear(rq);
+   }
 
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
-- 
1.8.3.1



[PATCH v2 05/10] sched/fair: Hoist idle_stamp up from idle_balance

2018-11-05 Thread Steve Sistare
Move the update of idle_stamp from idle_balance to the call site in
pick_next_task_fair, to prepare for a future patch that adds work to
pick_next_task_fair which must be included in the idle_stamp interval.
No functional change.
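
After the move, the call site brackets the balance attempt roughly as in the
sketch below (condensed from the hunks that follow; IF_SMP() expands to
nothing on !CONFIG_SMP builds):

	/* start measuring idle time before any attempt to find work */
	IF_SMP(rq->idle_stamp = rq_clock(rq);)

	new_tasks = idle_balance(rq, rf);

	/* work was found, so the CPU is not going idle after all */
	if (new_tasks)
		IF_SMP(rq->idle_stamp = 0;)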

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c623338..305edf8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3724,6 +3724,8 @@ static inline bool within_margin(int value, int margin)
WRITE_ONCE(p->se.avg.util_est, ue);
 }
 
+#define IF_SMP(statement)  statement
+
 static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
@@ -3769,6 +3771,8 @@ static inline int idle_balance(struct rq *rq, struct 
rq_flags *rf)
return 0;
 }
 
+#define IF_SMP(statement)  /* empty */
+
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
 
@@ -6740,8 +6744,17 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
return p;
 
 idle:
+   /*
+* We must set idle_stamp _before_ calling idle_balance(), such that we
+* measure the duration of idle_balance() as idle time.
+*/
+   IF_SMP(rq->idle_stamp = rq_clock(rq);)
+
new_tasks = idle_balance(rq, rf);
 
+   if (new_tasks)
+   IF_SMP(rq->idle_stamp = 0;)
+
/*
 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 * possible for any higher priority task to appear. In that case we
@@ -9504,12 +9517,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
u64 curr_cost = 0;
 
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
-*/
-   this_rq->idle_stamp = rq_clock(this_rq);
-
-   /*
 * Do not pull tasks towards !active CPUs...
 */
if (!cpu_active(this_cpu))
@@ -9600,9 +9607,6 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
-   if (pulled_task)
-   this_rq->idle_stamp = 0;
-
rq_repin_lock(this_rq, rf);
 
return pulled_task;
-- 
1.8.3.1



[PATCH v2 07/10] sched/fair: Provide can_migrate_task_llc

2018-11-05 Thread Steve Sistare
Define a simpler version of can_migrate_task called can_migrate_task_llc
which does not require a struct lb_env argument, and judges whether a
migration from one CPU to another within the same LLC should be allowed.
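
The stealing patch later in the series is the intended caller; a simplified,
illustrative sketch of such a caller follows (locking of dst_rq and the attach
step are omitted, and the variable names here are not from this patch):

	/* with rq->lock held, look for a task that may move to dst_rq */
	struct task_struct *p;

	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
		if (can_migrate_task_llc(p, rq, dst_rq)) {
			detach_task(p, rq, dst_rq->cpu);
			/* ... then lock dst_rq and attach_task(dst_rq, p) ... */
			break;
		}
	}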

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eb6e6cd..6b08383 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7166,6 +7166,34 @@ int can_migrate_task(struct task_struct *p, struct 
lb_env *env)
 }
 
 /*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+   int dst_cpu = dst_rq->cpu;
+
+   lockdep_assert_held(&rq->lock);
+
+   if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+   return false;
+
+   if (!cpumask_test_cpu(dst_cpu, &p->cpus_allowed)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+   return false;
+   }
+
+   if (task_running(rq, p)) {
+   schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+   return false;
+   }
+
+   return true;
+}
+
+/*
  * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
  */
 static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
-- 
1.8.3.1



[PATCH v2 02/10] sched/topology: Provide hooks to allocate data shared per LLC

2018-11-05 Thread Steve Sistare
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and
free data pointed to by struct sched_domain_shared at the last-level-cache
domain.  sd_llc_alloc_all() is called after the SD hierarchy is known, to
eliminate the unnecessary allocations that would occur if we instead
allocated in __sdt_alloc() and then figured out which shared nodes are
redundant.
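
A client of these hooks hangs its own per-LLC data off sd->shared.  A minimal
sketch of what a filled-in hook pair could look like (my_llc_data is a
hypothetical field used only for illustration; it is not added by this patch):

	static int sd_llc_alloc(struct sched_domain *sd)
	{
		struct sched_domain_shared *sds = sd->shared;
		int nid = cpu_to_node(cpumask_first(sched_domain_span(sd)));

		sds->my_llc_data = kzalloc_node(sizeof(*sds->my_llc_data),
						GFP_KERNEL, nid);
		return sds->my_llc_data ? 0 : 1;   /* nonzero aborts the domain build */
	}

	static void sd_llc_free(struct sched_domain *sd)
	{
		struct sched_domain_shared *sds = sd->shared;

		if (!sds)
			return;
		kfree(sds->my_llc_data);
		sds->my_llc_data = NULL;
	}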

Signed-off-by: Steve Sistare 
---
 kernel/sched/topology.c | 75 -
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 505a41c..a2363f6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,12 @@
 cpumask_var_t sched_domains_tmpmask;
 cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
@@ -361,8 +367,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 */
free_sched_groups(sd->groups, 1);
 
-   if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+   if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+   sd_llc_free(sd);
kfree(sd->shared);
+   }
kfree(sd);
 }
 
@@ -993,6 +1001,7 @@ static void __free_domain_allocs(struct s_data *d, enum 
s_alloc what,
free_percpu(d->sd);
/* Fall through */
case sa_sd_storage:
+   sd_llc_free_all(cpu_map);
__sdt_free(cpu_map);
/* Fall through */
case sa_none:
@@ -1602,6 +1611,62 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+   /* Allocate sd->shared data here. Empty for now. */
+
+   return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+   struct sched_domain_shared *sds = sd->shared;
+
+   if (!sds)
+   return;
+
+   /* Free data here. Empty for now. */
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+   struct sched_domain *sd, *hsd;
+   int i;
+
+   for_each_cpu(i, cpu_map) {
+   /* Find highest domain that shares resources */
+   hsd = NULL;
+   for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+   if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+   break;
+   hsd = sd;
+   }
+   if (hsd && sd_llc_alloc(hsd))
+   return 1;
+   }
+
+   return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+   struct sched_domain_topology_level *tl;
+   struct sched_domain *sd;
+   struct sd_data *sdd;
+   int j;
+
+   for_each_sd_topology(tl) {
+   sdd = &tl->data;
+   if (!sdd)
+   continue;
+   for_each_cpu(j, cpu_map) {
+   sd = *per_cpu_ptr(sdd->sd, j);
+   if (sd)
+   sd_llc_free(sd);
+   }
+   }
+}
+
 static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
@@ -1690,6 +1755,14 @@ static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_leve
}
}
 
+   /*
+* Allocate shared sd data at last level cache.  Must be done after
+* domains are built above, but before the data is used in
+* cpu_attach_domain and descendants below.
+*/
+   if (sd_llc_alloc_all(cpu_map, &d))
+   goto error;
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
-- 
1.8.3.1



[PATCH v2 00/10] steal tasks to improve CPU utilization

2018-11-05 Thread Steve Sistare
   9.3
      16   43.779     0.1    41.741     0.2         4.8

KVM 4-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
tbench, average of 11 runs.

  clients    %speedup
        1        16.2
        2        11.7
        4         9.9
        8        12.8
       16        13.7

KVM 2-cpu
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  Benchmark %speedup
  specjbb2015_critical_jops  5.7
  mysql_sysb1.0.14_mutex_2  40.6
  mysql_sysb1.0.14_oltp_2    3.9

-- 2 Socket Results --

X6-2: 2 sockets * 10 cores * 2 hyperthreads = 40 CPUs
Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

             --- base ---       --- new ---
  groups     time  %stdev      time  %stdev    %speedup
       1    7.945     0.2     7.219     8.7        10.0
       2    8.444     0.4     6.689     1.5        26.2
       3   12.100     1.1     9.962     2.0        21.4
       4   15.001     0.4    13.109     1.1        14.4
       8   27.960     0.2    26.127     0.3         7.0

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

             --- base ---       --- new ---
  groups     time  %stdev      time  %stdev    %speedup
       1    5.826     5.4     5.840     5.0        -0.3
       2    5.041     5.3     6.171    23.4       -18.4
       3    6.839     2.1     6.324     3.8         8.1
       4    8.177     0.6     7.318     3.6        11.7
       8   14.429     0.7    13.966     1.3         3.3
      16   26.401     0.3    25.149     1.5         4.9


X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
120  40   -1.2
240  802.7
360 1208.9
480 1604.4
600 2003.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd19.6
  mysql_sysb1.0.14_fileio_56_seqrd12.1
  mysql_sysb1.0.14_fileio_56_rndwr 0.4
  mysql_sysb1.0.14_fileio_56_seqrewr  -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd19.5
  pgsql_sysb1.0.14_fileio_56_seqrd 8.6
  pgsql_sysb1.0.14_fileio_56_rndwr 1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr   0.5

  opatch_time_ASM_12.2.0.1.0_HP2M  7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M 5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M  4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M   5.8

  lm3_memlat_L24.8
  lm3_memlat_L10.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching 60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent   -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks 2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72               1.3
  is      C         72               0.9
  is      D         72               0.7

  sysbench mysql, average of 24 runs
            --- base ---         --- new ---
  nthr    events  %stdev      events  %stdev    %speedup
     1     331.0    0.25       331.0    0.24        -0.1
     2     661.3    0.22       661.8    0.22         0.0
     4    1297.0    0.88      1300.5    0.82         0.2
     8    2420.8    0.04      2420.5    0.04        -0.1
    16    4826.3    0.07      4825.4    0.05        -0.1
    32    8815.3    0.27      8830.2    0.18         0.1
    64   12823.0    0.24     12823.6    0.26         0.0

--

Changes from v1 to v2:
  - Remove stray find_time hunk from patch 5
  - Fix "warning: label out defined but not used" for !CONFIG_SCHED_SMT
  - Set SCHED_STEAL_NODE_LIMIT_DEFAULT to 2
  - Steal iff avg_idle exceeds the cost of stealing

Steve Sistare (10):
  sched: Provide sparsemask, a reduced contention bitmap
  sched/topology: Provide hooks to allocate data shared per LLC
  sched/topology: Provide cfs_overload_cpus bitmap
  sched/fair: Dynamically update cfs_overload_cpus
  sched/fair: Hoist idle_stamp up from idle_balance
  sched/fair: Generalize the detach_task interface
  sched/fair: Provide can_migrate_task_llc
  sched/fair: Steal work from an overloaded CPU when CPU goes idle
  sched/fair: disable stealing if too many NUMA nodes
  sched/fair: Provide idle search schedstats

 include/linux/sched/topology.h |   1 +
 include/linux/sparsemask.h | 260 +++
 kernel/sched/core.c|  30 +++-
 kernel

[PATCH 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle

2018-10-22 Thread Steve Sistare
 @ 2.20GHz
Average of 10 runs of: hackbench  process 10

             --- base ---       --- new ---
  groups     time  %stdev      time  %stdev    %speedup
       1    5.826     5.4     5.840     5.0        -0.3
       2    5.041     5.3     6.171    23.4       -18.4
       3    6.839     2.1     6.324     3.8         8.1
       4    8.177     0.6     7.318     3.6        11.7
       8   14.429     0.7    13.966     1.3         3.3
      16   26.401     0.3    25.149     1.5         4.9

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage

  Customers   Users   %speedup
120  40   -1.2
240  802.7
360 1208.9
480 1604.4
600 2003.0

X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".

  Benchmark   %speedup

  mysql_sysb1.0.14_fileio_56_rndrd19.6
  mysql_sysb1.0.14_fileio_56_seqrd12.1
  mysql_sysb1.0.14_fileio_56_rndwr 0.4
  mysql_sysb1.0.14_fileio_56_seqrewr  -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd19.5
  pgsql_sysb1.0.14_fileio_56_seqrd 8.6
  pgsql_sysb1.0.14_fileio_56_rndwr 1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr   0.5

  opatch_time_ASM_12.2.0.1.0_HP2M  7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M 5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M  4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M   5.8

  lm3_memlat_L24.8
  lm3_memlat_L10.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching 60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent   -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks 2.4

X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc      B         72               1.3
  is      C         72               0.9
  is      D         72               0.7

  sysbench mysql, average of 24 runs
            --- base ---         --- new ---
  nthr    events  %stdev      events  %stdev    %speedup
     1     331.0    0.25       331.0    0.24        -0.1
     2     661.3    0.22       661.8    0.22         0.0
     4    1297.0    0.88      1300.5    0.82         0.2
     8    2420.8    0.04      2420.5    0.04        -0.1
    16    4826.3    0.07      4825.4    0.05        -0.1
    32    8815.3    0.27      8830.2    0.18         0.1
    64   12823.0    0.24     12823.6    0.26         0.0

-----

Signed-off-by: Steve Sistare 
---
 kernel/sched/fair.c | 160 ++--
 kernel/sched/features.h |   6 ++
 2 files changed, 161 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6548bed..cb86ec9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3730,6 +3730,9 @@ static void overload_clear(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3741,6 +3744,9 @@ static void overload_set(struct rq *rq)
 {
struct sparsemask *overload_cpus;
 
+   if (!sched_feat(STEAL))
+   return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -3748,6 +3754,8 @@ static void overload_set(struct rq *rq)
rcu_read_unlock();
 }
 
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG  0x0
@@ -3783,6 +3791,11 @@ static inline void overload_set(struct rq *rq) {}
 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 bool task_sleep) {}
 
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -6745,12 +6758,14 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
 
 idle:
/*
-* We must set idle_stamp _before_ calling idle_balance(), such that we
-* measure the duration of idle_balance() as idle time.
+* We must set idle_stamp _before_ calling try_steal() or
+* idle_balance(), such that we measure the duration as idle time.
 */
IF_SMP(rq->idle_stamp = rq_clock(rq);)
 
new_tasks = idle_balance(rq, rf);
+   if (new_tasks == 0)
+   new_tasks = try_steal(rq,

[PATCH 03/10] sched/topology: Provide cfs_overload_cpus bitmap

2018-10-22 Thread Steve Sistare
Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.
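
CFS code then reaches the mask through the rq pointer under RCU; the
accessors added in the companion fair.c patch boil down to:

	struct sparsemask *overload_cpus;

	rcu_read_lock();
	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
	if (overload_cpus)
		sparsemask_set_elem(rq->cpu, overload_cpus);  /* or sparsemask_clear_elem() */
	rcu_read_unlock();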

Signed-off-by: Steve Sistare 
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h   |  2 ++
 kernel/sched/topology.c| 21 +++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2634774..8bac15d 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@ struct sched_domain_shared {
atomic_tref;
atomic_tnr_busy_cpus;
int has_idle_cores;
+   struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa33..aadfe68 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED  1
@@ -805,6 +806,7 @@ struct rq {
struct cfs_rq   cfs;
struct rt_rqrt;
struct dl_rqdl;
+   struct sparsemask   *cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a2363f6..f18c416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include <linux/sparsemask.h>
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -440,6 +441,7 @@ static void update_top_cache_domain(int cpu)
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
+   struct sparsemask *cfs_overload_cpus;
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
 
@@ -481,6 +483,10 @@ static void update_top_cache_domain(int cpu)
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
 
+   sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+   cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
+   rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+
update_top_cache_domain(cpu);
 }
 
@@ -1611,9 +1617,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+#define ZALLOC_MASK(maskp, nelems, node) \
+   (!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,  \
+ SPARSEMASK_DENSITY_DEFAULT, \
+ GFP_KERNEL, node))  \
+
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-   /* Allocate sd->shared data here. Empty for now. */
+   struct sched_domain_shared *sds = sd->shared;
+   struct cpumask *span = sched_domain_span(sd);
+   int nid = cpu_to_node(cpumask_first(span));
+
+   if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
+   return 1;
 
return 0;
 }
@@ -1625,7 +1641,8 @@ static void sd_llc_free(struct sched_domain *sd)
if (!sds)
return;
 
-   /* Free data here. Empty for now. */
+   free_sparsemask(sds->cfs_overload_cpus);
+   sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
-- 
1.8.3.1



[PATCH 02/10] sched/topology: Provide hooks to allocate data shared per LLC

2018-10-22 Thread Steve Sistare
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and
free data pointed to by struct sched_domain_shared at the last-level-cache
domain.  sd_llc_alloc_all() is called after the SD hierarchy is known, to
eliminate the unnecessary allocations that would occur if we instead
allocated in __sdt_alloc() and then figured out which shared nodes are
redundant.

Signed-off-by: Steve Sistare 
---
 kernel/sched/topology.c | 75 -
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 505a41c..a2363f6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,12 @@
 cpumask_var_t sched_domains_tmpmask;
 cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
@@ -361,8 +367,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 */
free_sched_groups(sd->groups, 1);
 
-   if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+   if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+   sd_llc_free(sd);
kfree(sd->shared);
+   }
kfree(sd);
 }
 
@@ -993,6 +1001,7 @@ static void __free_domain_allocs(struct s_data *d, enum 
s_alloc what,
free_percpu(d->sd);
/* Fall through */
case sa_sd_storage:
+   sd_llc_free_all(cpu_map);
__sdt_free(cpu_map);
/* Fall through */
case sa_none:
@@ -1602,6 +1611,62 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+   /* Allocate sd->shared data here. Empty for now. */
+
+   return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+   struct sched_domain_shared *sds = sd->shared;
+
+   if (!sds)
+   return;
+
+   /* Free data here. Empty for now. */
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+   struct sched_domain *sd, *hsd;
+   int i;
+
+   for_each_cpu(i, cpu_map) {
+   /* Find highest domain that shares resources */
+   hsd = NULL;
+   for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+   if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+   break;
+   hsd = sd;
+   }
+   if (hsd && sd_llc_alloc(hsd))
+   return 1;
+   }
+
+   return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+   struct sched_domain_topology_level *tl;
+   struct sched_domain *sd;
+   struct sd_data *sdd;
+   int j;
+
+   for_each_sd_topology(tl) {
+   sdd = &tl->data;
+   if (!sdd)
+   continue;
+   for_each_cpu(j, cpu_map) {
+   sd = *per_cpu_ptr(sdd->sd, j);
+   if (sd)
+   sd_llc_free(sd);
+   }
+   }
+}
+
 static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
@@ -1690,6 +1755,14 @@ static struct sched_domain *build_sched_domain(struct 
sched_domain_topology_leve
}
}
 
+   /*
+* Allocate shared sd data at last level cache.  Must be done after
+* domains are built above, but before the data is used in
+* cpu_attach_domain and descendants below.
+*/
+   if (sd_llc_alloc_all(cpu_map, &d))
+   goto error;
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
-- 
1.8.3.1



[PATCH 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-10-22 Thread Steve Sistare
Provide struct sparsemask and functions to manipulate it.  A sparsemask is
a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
threads concurrently set, clear, and visit elements, by reducing the number
of significant bits per cacheline.  For each 64 byte chunk of the mask,
only the first K bits of the first word are used, and the remaining bits
are ignored, where K is a creation time parameter.  Thus a sparsemask that
can represent a set of N elements is approximately (N/K * 64) bytes in
size.
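
As a concrete instance of the layout math described above (a sketch only; the
real macros appear in sparsemask.h below):

	/*
	 * With K elements per 64-byte chunk (512 bits), element i lives at
	 * bit (i / K) * 512 + (i % K): elements 0..K-1 occupy the first
	 * cacheline, K..2K-1 the second, and so on.  A 128-element mask with
	 * K = 8 therefore needs 128/8 * 64 = 1024 bytes.
	 */
	static inline unsigned int sparse_elem_to_bit(unsigned int i, unsigned int K)
	{
		return (i / K) * 512 + (i % K);
	}
	/* e.g. K = 8: element 9 -> bit 513, the second bit of the second chunk */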

Signed-off-by: Steve Sistare 
---
 include/linux/sparsemask.h | 260 +
 lib/Makefile   |   2 +-
 lib/sparsemask.c   | 142 +
 3 files changed, 403 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sparsemask.h
 create mode 100644 lib/sparsemask.c

diff --git a/include/linux/sparsemask.h b/include/linux/sparsemask.h
new file mode 100644
index 000..d36a3be
--- /dev/null
+++ b/include/linux/sparsemask.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include 
+#include 
+#include 
+
+/*
+ * A sparsemask is a sparse bitmap.  It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements.  For
+ * each 64 byte chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter.  Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * 64) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ *
+ * This file is partially derived from cpumask.h, and the public sparsemask
+ * operations are drop-in replacements for cpumask operations. However,
+ * sparsemask has no dependency on CPU definitions and can be used to
+ * represent any kind of elements.
+ */
+
+struct sparsemask {
+   short nelems;   /* current number of elements */
+   short density;  /* store 2^density elements per chunk */
+   unsigned long bits[0];  /* embedded array of chunks */
+};
+
+/* The maximum value for density, which implicitly defines the chunk size */
+
+#define _SMASK_DENSITY_MAX 6
+
+#define SMASK_DENSITY_TO_BYTES(density)  (1U << (density))
+#define SMASK_DENSITY_TO_ELEMS(density)  (1U << (density))
+
+/* The number of elements/bits/bytes/longs in a chunk */
+
+#define SMASK_ELEMS(mask)  SMASK_DENSITY_TO_ELEMS((mask)->density)
+#define SMASK_BYTES  SMASK_DENSITY_TO_BYTES(_SMASK_DENSITY_MAX)
+#define SMASK_BITS   (SMASK_BYTES * BITS_PER_BYTE)
+#define SMASK_LONGS  (SMASK_BYTES / sizeof(long))
+
+/*
+ * Translate element index @elem to a bit/byte/long index.
+ * @density: the density of a chunk.
+ */
+
+#define _SMASK_ELEM_TO_BIT(elem, density)  \
+   ((elem) / SMASK_DENSITY_TO_ELEMS(density) * SMASK_BITS +\
+(elem) % SMASK_DENSITY_TO_ELEMS(density))
+
+#define _SMASK_ELEM_TO_BYTE(elem, density) \
+   (_SMASK_ELEM_TO_BIT(elem, density) / BITS_PER_BYTE)
+
+#define _SMASK_ELEM_TO_LONG(elem, density) \
+   (_SMASK_ELEM_TO_BYTE(elem, density) / sizeof(long))
+
+/* Translate @bit/@byte/@long index to an element index */
+
+#define _SMASK_BIT_TO_ELEM(bit, density)   \
+   ((bit) / SMASK_BITS * SMASK_DENSITY_TO_ELEMS(density) + \
+(bit) % SMASK_BITS)
+
+#define _SMASK_BYTE_TO_ELEM(byte, density) \
+   _SMASK_BIT_TO_ELEM((byte) * BITS_PER_BYTE, density)
+
+#define _SMASK_LONG_TO_ELEM(index, density)\
+   _SMASK_BYTE_TO_ELEM((index) * sizeof(long), density)
+
+/* Same translations as above, but taking sparsemask @m instead of density */
+
+#define SMASK_ELEM_TO_BYTE(elem, m)  _SMASK_ELEM_TO_BYTE(elem, (m)->density)
+#define SMASK_ELEM_TO_BIT(elem, m)   _SMASK_ELEM_TO_BIT(elem, (m)->density)
+#define SMASK_ELEM_TO_LONG(elem, m)  _SMASK_ELEM_TO_LONG(elem, (m)->density)
+#define SMASK_BYTE_TO_ELEM(byte, m)  _SMASK_BYTE_TO_ELEM(byte, (m)->density)
+#define SMASK_BIT_TO_ELEM(bit, m)    _SMASK_BIT_TO_ELEM(bit, (m)->density)
+#define SMASK_LONG_TO_ELEM(index, m) _SMASK_LONG_TO_ELEM(index, (m)->density)
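
A worked example of the translation above, assuming the 64-byte chunk size
implied by _SMASK_DENSITY_MAX (so SMASK_BITS is 512): with density 3 a chunk
holds 8 significant elements, and element 19 lands in chunk 19/8 = 2 at
offset 19%8 = 3, i.e. bit 2*512 + 3 = 1027.  The user-space sketch below
mirrors _SMASK_ELEM_TO_BIT for illustration only.

#include <stdio.h>

#define CHUNK_BITS 512UL	/* 64-byte chunk * 8 bits per byte */

/* Mirror of _SMASK_ELEM_TO_BIT(): map an element index to its bit index. */
static unsigned long elem_to_bit(unsigned long elem, unsigned int density)
{
	unsigned long elems_per_chunk = 1UL << density;

	return elem / elems_per_chunk * CHUNK_BITS + elem % elems_per_chunk;
}

int main(void)
{
	printf("elem 19 -> bit %lu\n", elem_to_bit(19, 3));	/* prints 1027 */
	return 0;
}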
