Re: [PATCH 2/2] mirror throttling

2013-02-14 Thread Alasdair G Kergon
On Wed, Jan 09, 2013 at 12:44:38AM -0500, Mikulas Patocka wrote:
> We maintain a history of kcopyd usage in the variables io_period and
> total_period. The actual kcopyd activity is "(100 * io_period /
> total_period)" percent of the time. If we exceed the user-defined
> percentage threshold, we sleep.
 
Well, I'm going to take this pair of patches for now.

Some people do need this throttling today and this seems to me to be a
decent and simple way to give them a lever to deal with the problem.

I'm not proposing we expose this through LVM or other userspace tools at
this stage: people who need it should tune it directly through sysfs.
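For example (a sketch only; the exact module and parameter names come from
patch 1/2 and from whichever target registers the throttle):

  echo 25 > /sys/module/<target_module>/parameters/<throttle_parameter>

The value is the percentage of time kcopyd is allowed to spend copying; a
value of 100 or more effectively disables the limit.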

If need be, we can revisit this in future either by refining the
algorithm or making it adjustable per-device rather than per-module.
(Or by re-vamping kcopyd itself...)

The current version is:
  
http://people.redhat.com/agk/patches/linux/editing/dm-kcopyd-introduce-configurable-throttling.patch

Alasdair


[PATCH 2/2] mirror throttling

2013-01-08 Thread Mikulas Patocka
dm-kcopyd: use throttle

This patch allows the administrator to limit the kcopyd copy rate.

We maintain a history of kcopyd usage in the variables io_period and
total_period. The actual kcopyd activity is "(100 * io_period /
total_period)" percent of the time. If we exceed the user-defined
percentage threshold, we sleep.
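
A worked example (the numbers are made up; the formula is the one used in
io_job_start() below): with the throttle set to 30, io_period = 40 and
total_period = 100 jiffies, the activity is 40% and skew = 40 - 30 * 100 /
100 = 10. The skew is positive, so io_job_start() sleeps before letting the
job issue its I/O.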

Signed-off-by: Mikulas Patocka 

---
 drivers/md/dm-kcopyd.c |  110 +
 1 file changed, 110 insertions(+)

Index: linux-3.8-rc1-fast/drivers/md/dm-kcopyd.c
===================================================================
--- linux-3.8-rc1-fast.orig/drivers/md/dm-kcopyd.c	2013-01-02 23:23:17.0 +0100
+++ linux-3.8-rc1-fast/drivers/md/dm-kcopyd.c	2013-01-02 23:23:25.0 +0100
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
 
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
struct workqueue_struct *kcopyd_wq;
struct work_struct kcopyd_work;
 
+   struct dm_kcopyd_throttle *throttle;
+
 /*
  * We maintain three lists of jobs:
  *
@@ -68,6 +71,108 @@ struct dm_kcopyd_client {
 
 static struct page_list zero_page_list;
 
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
+ * IO/IDLE accounting slowly decays after (1 << ACOUNT_INTERVAL_SHIFT) period.
+ * When total_period >= (1 << ACOUNT_INTERVAL_SHIFT) the counters are divided
+ * by 2.
+ */
+#define ACOUNT_INTERVAL_SHIFT  SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * It is an experimentally found value.
+ * Smaller values cause an increased copy rate above the limit. The reason
+ * for this is unknown. Possible explanations are jiffies rounding errors
+ * or a read/write cache inside the disk.
+ */
+#define SLEEP_MSEC 100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if
+ * multiple kcopyd clients do work simultaneously; this limit allows us to
+ * get out of the livelock.
+ */
+#define MAX_SLEEPS 10
+
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+   unsigned throttle, now, difference;
+   int slept, skew;
+
+   if (unlikely(!t))
+   return;
+
+   slept = 0;
+
+try_again:
+   spin_lock_irq(&throttle_spinlock);
+
+   throttle = ACCESS_ONCE(t->throttle);
+
+   if (likely(throttle >= 100))
+   goto skip_limit;
+
+   now = jiffies;
+   difference = now - t->last_jiffies;
+   t->last_jiffies = now;
+   if (t->num_io_jobs)
+   t->io_period += difference;
+   t->total_period += difference;
+
+   if (unlikely(t->total_period >= (1 << ACOUNT_INTERVAL_SHIFT))) {
+   int shift = fls(t->total_period >> ACOUNT_INTERVAL_SHIFT);
+   t->total_period >>= shift;
+   t->io_period >>= shift;
+   }
+
+   skew = t->io_period - throttle * t->total_period / 100;
+   /* skew = t->io_period * 100 / throttle - t->total_period; */
+   if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+   slept++;
+   spin_unlock_irq(&throttle_spinlock);
+   msleep(SLEEP_MSEC);
+   goto try_again;
+   }
+
+skip_limit:
+   t->num_io_jobs++;
+
+   spin_unlock_irq(&throttle_spinlock);
+}
+
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+   unsigned long flags;
+
+   if (unlikely(!t))
+   return;
+
+   spin_lock_irqsave(&throttle_spinlock, flags);
+
+   t->num_io_jobs--;
+
+   if (likely(ACCESS_ONCE(t->throttle) >= 100))
+   goto skip_limit;
+
+   if (!t->num_io_jobs) {
+   unsigned now, difference;
+
+   now = jiffies;
+   difference = now - t->last_jiffies;
+   t->last_jiffies = now;
+
+   t->io_period += difference;
+   t->total_period += difference;
+   }
+
+skip_limit:
+   spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
 static void wake(struct dm_kcopyd_client *kc)
 {
queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -348,6 +453,8 @@ static void complete_io(unsigned long er
struct kcopyd_job *job = (struct kcopyd_job *) context;
struct dm_kcopyd_client *kc = job->kc;
 
+   io_job_finish(kc->throttle);
+
if (error) {
if (job->rw & WRITE)
job->write_err |= error;
@@ -389,6 +496,8 @@ static int run_io_job(struct kcopyd_job 
.client = job->kc->io_client,
};
 
+   io_job_start(job->kc->throttle);
+
if (job->rw == READ)
r = dm_io(&io_req, 1, &job->source, NULL);
else
@@ -708,6 +817,7 @@ struct dm_kcopyd_client *dm_kcopyd_clien
INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
+   kc->throttle = throttle;
 
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
