[PATCH v5 7/8] Define PERF_PMU_TXN_READ interface
Define a new PERF_PMU_TXN_READ interface to read a group of counters at once. pmu->start_txn()// Initialize before first event for each event in group pmu->read(event); // Queue each event to be read rc = pmu->commit_txn() // Read/update all queued counters Note that we use this interface with all PMUs. PMUs that implement this interface use the ->read() operation to _queue_ the counters to be read and use ->commit_txn() to actually read all the queued counters at once. PMUs that don't implement PERF_PMU_TXN_READ ignore ->start_txn() and ->commit_txn() and continue to read counters one at a time. Thanks to input from Peter Zijlstra. Signed-off-by: Sukadev Bhattiprolu --- Changelog[v4] - [Peter Zijlstra] Add lockdep_assert_held() in perf_event_read_group(). Make sure the entire transaction happens on the same CPU. --- include/linux/perf_event.h |1 + kernel/events/core.c | 24 +++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 44bf05f..da307ad 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -169,6 +169,7 @@ struct perf_event; #define PERF_EVENT_TXN 0x1 #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ +#define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags diff --git a/kernel/events/core.c b/kernel/events/core.c index e3ce047..fde2f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3189,6 +3189,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is @@ -3207,18 +3208,31 @@ static void __perf_event_read(void *info) } update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + if (event->state != 
PERF_EVENT_STATE_ACTIVE) + goto unlock; - if (!data->group) + if (!data->group) { + pmu->read(event); + data->ret = 0; goto unlock; + } + + pmu->start_txn(pmu, PERF_PMU_TXN_READ); + + pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { update_event_times(sub); - if (sub->state == PERF_EVENT_STATE_ACTIVE) + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* +* Use sibling's PMU rather than @event's since +* sibling could be on different (eg: software) PMU. +*/ sub->pmu->read(sub); + } } - data->ret = 0; + + data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 5/8] perf: Invert perf_read_group() loops
From: Peter Zijlstra In order to enable the use of perf_event_read(.group = true), we need to invert the sibling-child loop nesting of perf_read_group(). Currently we iterate the child list for each sibling, this precludes using group reads. Flip things around so we iterate each group for each child. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sukadev Bhattiprolu --- Changes to Peter's patch: - Add GFP_KERNEL to kzalloc(). - Pass in address of counter to atomic_read(). - Return event->size rather than leader->size (perf_read_group()) - Keep chkpatch happy. --- kernel/events/core.c | 85 -- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 31ec842..2221ebe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3795,50 +3795,75 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_read_group(struct perf_event *event, - u64 read_format, char __user *buf) +static void __perf_read_group_add(struct perf_event *leader, + u64 read_format, u64 *values) { - struct perf_event *leader = event->group_leader, *sub; - struct perf_event_context *ctx = leader->ctx; - int n = 0, size = 0, ret; - u64 count, enabled, running; - u64 values[5]; + struct perf_event *sub; + int n = 1; /* skip @nr */ - lockdep_assert_held(&ctx->mutex); + perf_event_read(leader, true); + + /* +* Since we co-schedule groups, {enabled,running} times of siblings +* will be identical to those of the leader, so we only publish one +* set. 
+*/ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] += leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } - count = perf_event_read_value(leader, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] += leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; + /* +* Write {count,id} tuples for every sibling. +*/ + values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - size = n * sizeof(u64); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + values[n++] += perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + } +} - if (copy_to_user(buf, values, size)) - return -EFAULT; +static int perf_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *child; + struct perf_event_context *ctx = leader->ctx; + int ret = event->read_size; + u64 *values; - ret = size; + lockdep_assert_held(&ctx->mutex); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; + values = kzalloc(event->read_size, GFP_KERNEL); + if (!values) + return -ENOMEM; - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); + values[0] = 1 + leader->nr_siblings; + + /* +* By locking the child_mutex of the leader we effectively +* lock the child list of all siblings.. XXX explain how. 
+*/ + mutex_lock(&leader->child_mutex); - size = n * sizeof(u64); + __perf_read_group_add(leader, read_format, values); + list_for_each_entry(child, &leader->child_list, child_list) + __perf_read_group_add(child, read_format, values); - if (copy_to_user(buf + ret, values, size)) { - return -EFAULT; - } + mutex_unlock(&leader->child_mutex); - ret += size; - } + if (copy_to_user(buf, values, event->read_size)) + ret = -EFAULT; + + kfree(values); return ret; } -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 2/8] perf: Split perf_event_read() and perf_event_count()
perf_event_read() does two things: - call the PMU to read/update the counter value, and - compute the total count of the event and its children Not all callers need both. perf_event_reset() for instance needs the first piece but doesn't need the second. Similarly, when we implement the ability to read a group of events using the transaction interface, we would need the two pieces done independently. Break up perf_event_read() and have it just read/update the counter and have the callers compute the total count if necessary. Signed-off-by: Sukadev Bhattiprolu --- kernel/events/core.c | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4435bf5..f9ca8cb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3212,7 +3212,7 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static u64 perf_event_read(struct perf_event *event) +static void perf_event_read(struct perf_event *event) { /* * If event is enabled and currently active on a CPU, update the @@ -3238,8 +3238,6 @@ static u64 perf_event_read(struct perf_event *event) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - - return perf_event_count(event); } /* @@ -3751,14 +3749,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) *running = 0; mutex_lock(&event->child_mutex); - total += perf_event_read(event); + + perf_event_read(event); + total += perf_event_count(event); + *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); + perf_event_read(child); + total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -3918,7 +3920,7 @@ static unsigned int 
perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - (void)perf_event_read(event); + perf_event_read(event); local64_set(&event->count, 0); perf_event_update_userpage(event); } -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 6/8] perf: Add return value for perf_event_read().
When we implement the ability to read several counters at once (using the PERF_PMU_TXN_READ transaction interface), perf_event_read() can fail when the 'group' parameter is true (eg: trying to read too many events at once). For now, have perf_event_read() return an integer. Ignore the return value when 'group' parameter is false. Signed-off-by: Sukadev Bhattiprolu --- kernel/events/core.c | 45 ++--- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2221ebe..e3ce047 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3177,6 +3177,7 @@ void perf_event_exec(void) struct perf_read_data { struct perf_event *event; bool group; + int ret; }; /* @@ -3217,6 +3218,7 @@ static void __perf_event_read(void *info) if (sub->state == PERF_EVENT_STATE_ACTIVE) sub->pmu->read(sub); } + data->ret = 0; unlock: raw_spin_unlock(&ctx->lock); @@ -3230,8 +3232,10 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static void perf_event_read(struct perf_event *event, bool group) +static int perf_event_read(struct perf_event *event, bool group) { + int ret = 0; + /* * If event is enabled and currently active on a CPU, update the * value in the event structure: @@ -3240,9 +3244,11 @@ static void perf_event_read(struct perf_event *event, bool group) struct perf_read_data data = { .event = event, .group = group, + .ret = 0, }; smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3263,6 +3269,8 @@ static void perf_event_read(struct perf_event *event, bool group) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } + + return ret; } /* @@ -3775,7 +3783,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) mutex_lock(&event->child_mutex); - perf_event_read(event, 
false); + (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + @@ -3784,7 +3792,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - perf_event_read(child, false); + (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; @@ -3795,13 +3803,16 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static void __perf_read_group_add(struct perf_event *leader, +static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event *sub; int n = 1; /* skip @nr */ + int ret; - perf_event_read(leader, true); + ret = perf_event_read(leader, true); + if (ret) + return ret; /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -3830,6 +3841,8 @@ static void __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + + return 0; } static int perf_read_group(struct perf_event *event, @@ -3837,7 +3850,7 @@ static int perf_read_group(struct perf_event *event, { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; - int ret = event->read_size; + int ret; u64 *values; lockdep_assert_held(&ctx->mutex); @@ -3854,17 +3867,27 @@ static int perf_read_group(struct perf_event *event, */ mutex_lock(&leader->child_mutex); - __perf_read_group_add(leader, read_format, values); - list_for_each_entry(child, &leader->child_list, child_list) - __perf_read_group_add(child, read_format, values); + ret = __perf_read_group_add(leader, read_format, values); + if (ret) + goto unlock; + + list_for_each_entry(child, &leader->child_list, child_list) { + ret = 
__perf_read_group_add(child, read_format, values); + if (ret) +
[PATCH v5 3/8] perf: Rename perf_event_read_{one, group}, perf_read_hw
From: "Peter Zijlstra (Intel)" In order to free up the perf_event_read_group() name: s/perf_event_read_\(one\|group\)/perf_read_\1/g s/perf_read_hw/__perf_read/g Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f9ca8cb..02095f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3675,7 +3675,7 @@ static void put_event(struct perf_event *event) * see the comment there. * * 2) there is a lock-inversion with mmap_sem through -* perf_event_read_group(), which takes faults while +* perf_read_group(), which takes faults while * holding ctx->mutex, however this is called after * the last filedesc died, so there is no possibility * to trigger the AB-BA case. @@ -3770,7 +3770,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_event_read_group(struct perf_event *event, +static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; @@ -3818,7 +3818,7 @@ static int perf_event_read_group(struct perf_event *event, return ret; } -static int perf_event_read_one(struct perf_event *event, +static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; @@ -3856,7 +3856,7 @@ static bool is_event_hup(struct perf_event *event) * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +__perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; @@ -3874,9 +3874,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, 
buf); + ret = perf_read_group(event, read_format, buf); else - ret = perf_event_read_one(event, read_format, buf); + ret = perf_read_one(event, read_format, buf); return ret; } @@ -3889,7 +3889,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ctx = perf_event_ctx_lock(event); - ret = perf_read_hw(event, buf, count); + ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/8] perf: Add a flags parameter to pmu txn interfaces
Currently, the PMU interface allows reading only one counter at a time. But some PMUs like the 24x7 counters in Power, support reading several counters at once. To leverage this functionality, extend the transaction interface to support a "transaction type". The first type, PERF_PMU_TXN_ADD, refers to the existing transactions, i.e. used to _schedule_ all the events on the PMU as a group. A second transaction type, PERF_PMU_TXN_READ, will be used in a follow-on patch, by the 24x7 counters to read several counters at once. Extend the transaction interfaces to the PMU to accept a 'txn_flags' parameter and use this parameter to ignore any transactions that are not of type PERF_PMU_TXN_ADD. Thanks to Peter Zijlstra for his input. Signed-off-by: Sukadev Bhattiprolu --- Changelog[v4] - [Peter Zijlstra] Fix a copy-paste error in power_pmu_cancel_txn(). - [Peter Zijlstra] Use __this_cpu_read() and __this_cpu_write(). Changelog[v3] - [Peter Zijlstra] Ensure the nop_txn interfaces disable/enable PMU only for TXN_ADD transactions. - [Peter Zijlstra] Cache the flags parameter in ->start_txn() and drop the flags parameter from ->commit_txn() and ->cancel_txn(). 
--- arch/powerpc/perf/core-book3s.c | 25 - arch/s390/kernel/perf_cpum_cf.c | 24 +++- arch/sparc/kernel/perf_event.c | 19 ++- arch/x86/kernel/cpu/perf_event.c | 27 +-- arch/x86/kernel/cpu/perf_event.h |1 + include/linux/perf_event.h | 14 +++--- kernel/events/core.c | 31 --- 7 files changed, 130 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index d90893b..b18efe4 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -50,6 +50,7 @@ struct cpu_hw_events { unsigned int group_flag; int n_txn_start; + int txn_flags; /* BHRB bits */ u64 bhrb_filter;/* BHRB HW branch filter */ @@ -1586,11 +1587,19 @@ static void power_pmu_stop(struct perf_event *event, int ef_flags) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. 
*/ -static void power_pmu_start_txn(struct pmu *pmu) +static void power_pmu_start_txn(struct pmu *pmu, int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; @@ -1604,6 +1613,12 @@ static void power_pmu_start_txn(struct pmu *pmu) static void power_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + int txn_flags; + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1618,10 +1633,18 @@ static int power_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw; long i, n; + int txn_flags; if (!ppmu) return -EAGAIN; + cpuhw = this_cpu_ptr(&cpu_hw_events); + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) + return 0; + n = cpuhw->n_events; if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) return -EAGAIN; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 56fdad4..a6f9e7b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -72,6 +72,7 @@ struct cpu_hw_events { atomic_tctr_set[CPUMF_CTR_SET_MAX]; u64 state, tx_state; unsigned intflags; + int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .ctr_set = { @@ -82,6 +83,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { }, .state = 0, .flags = 0, + .txn_flags = 0, }; static int get_counter_set(u64 event) @@ -572,11 +574,19 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) /* * Start group events scheduling transaction. * Set flags to perform a single test at commit time. + * + * We only support PERF_PMU_TXN_ADD transactions. 
Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void cpumf_pmu_start_
[PATCH v5 0/8] perf: Implement group-read of events using txn interface
Unlike normal hardware PMCs, the 24x7 counters in Power8 are stored in memory and accessed via a hypervisor call (HCALL). A major aspect of the HCALL is that it allows retireving _several_ counters at once (unlike regular PMCs, which are read one at a time). By reading several counters at once, we can get a more consistent snapshot of the system. This patchset extends the transaction interface to accomplish submitting several events to the PMU and have the PMU read them all at once. User is expected to submit the set of events they want to read as an "event group". In the kernel, we submit each event to the PMU using the following logic (from Peter Zijlstra). pmu->start_txn(pmu, PMU_TXN_READ); leader->read(); for_each_sibling() sibling->read(); pmu->commit_txn(); where: - the ->read()s queue events to be submitted to the hypervisor, and, - the ->commit_txn() issues the HCALL, retrieves the result and updates the event count. Architectures/PMUs that don't need/implement PMU_TXN_READ type of transactions, simply ignore the ->start_txn() and ->commit_txn() and continue to read the counters one at a time in the ->read() call. Compile/touch tested on x86. Need help testing on s390 and Sparc. Thanks to Peter Zijlstra for his input/code. Changelog[v5] - Invert the sibling-child loop nesting in perf-read-group (re-org code and drop the patch that defined perf_event_aggregate()). Changelog[v4] - Ensure all the transactions operations happen on the same CPU so PMUs can use per-CPU buffers for the transaction. - Add lockdep assert and fix a locking issue in perf_read_group(). Changelog [v3] - Simple changes/reorg of patchset to split/rename functions - [Peter Zijlstra] Save the transaction flags in ->start_txn() and drop the flags parameter from ->commit_txn() and ->cancel_txn(). - [Peter Zijlstra] The nop txn interfaces don't need to disable/enable PMU for PERF_PMU_TXN_READ transactions. 
Changelog [v2] - Use the transaction interface unconditionally to avoid special-case code. Architectures/PMUs that don't need the READ transaction types simply ignore the ->start_txn() and ->commit_txn() calls. Peter Zijlstra (2): perf: Add group reads to perf_event_read() perf: Invert perf_read_group() loops Peter Zijlstra (Intel) (1): perf: Rename perf_event_read_{one,group}, perf_read_hw Sukadev Bhattiprolu (5): perf: Add a flags parameter to pmu txn interfaces perf: Split perf_event_read() and perf_event_count() perf: Add return value for perf_event_read(). Define PERF_PMU_TXN_READ interface powerpc/perf/hv-24x7: Use PMU_TXN_READ interface arch/powerpc/perf/core-book3s.c | 25 - arch/powerpc/perf/hv-24x7.c | 166 +- arch/s390/kernel/perf_cpum_cf.c | 24 - arch/sparc/kernel/perf_event.c | 19 +++- arch/x86/kernel/cpu/perf_event.c | 27 - arch/x86/kernel/cpu/perf_event.h |1 + include/linux/perf_event.h | 15 ++- kernel/events/core.c | 210 +- 8 files changed, 429 insertions(+), 58 deletions(-) -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 11/11] cxl: Add CONFIG_CXL_EEH symbol
Once cxlflash has been merged we might drop this, but until then: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 10/11] cxl: EEH support
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 09/11] cxl: Allow the kernel to trust that an image won't change on PERST.
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 08/11] cxl: Don't remove AFUs/vPHBs in cxl_reset
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 07/11] cxl: Refactor AFU init/teardown
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:25 +1000: > +rc = cxl_map_slice_regs(afu, adapter, dev); > +if (rc) > +return rc; > > -if ((rc = cxl_map_slice_regs(afu, adapter, dev))) Like the previous patch, mixing this coding style change in with this patch makes the diff harder to follow than necessary (though not as hard as the last one). If you happen to do a v5, please put the coding style changes in a separate patch, but otherwise it looks fine: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 06/11] cxl: Refactor adaptor init/teardown
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:24 +1000: > +/* This should contain *only* operations that can safely be done in > + * both creation and recovery. > + */ > +static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev) > { > -struct cxl *adapter; > -bool free = true; > int rc; > > +adapter->dev.parent = &dev->dev; > +adapter->dev.release = cxl_release_adapter; > +pci_set_drvdata(dev, adapter); These seem a bit odd here (though perfectly harmless) - not sure these need to be done again on recovery (but maybe I'm wrong?) - seems more like something that should be done early in cxl_init_adapter? > -if ((rc = cxl_update_image_control(adapter))) > -goto err2; > +rc = cxl_update_image_control(adapter); > +if (rc) These types of coding style changes should really be in a separate patch to make it easier to see exactly how you have changed the init path in this one. I know mpe wanted these changed and after looking at the diff pretty carefully I realise that you haven't actually changed much functionally so I'll let this pass, but if you happen to do another respin please move the style changes into a separate patch. Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/eeh: Probe after unbalanced kref check
In the complete hotplug case, EEH PEs are supposed to be released and set to NULL. Normally, this is done by eeh_remove_device(), which is called from pcibios_release_device(). However, if something is holding a kref to the device, it will not be released, and the PE will remain. eeh_add_device_late() has a check for this which will explicitly destroy the PE in this case. This check in eeh_add_device_late() occurs after a call to eeh_ops->probe(). On PowerNV, probe is a pointer to pnv_eeh_probe(), which will exit without probing if there is an existing PE. This means that on PowerNV, devices with outstanding krefs will not be rediscovered by EEH correctly after a complete hotplug. This is affecting CXL (CAPI) devices in the field. Put the probe after the kref check so that the PE is destroyed and affected devices are correctly rediscovered by EEH. Fixes: d91dafc02f42 ("powerpc/eeh: Delay probing EEH device during hotplug") Cc: sta...@vger.kernel.org Cc: Gavin Shan Signed-off-by: Daniel Axtens --- arch/powerpc/kernel/eeh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index af9b597b10af..8e61d717915e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1116,9 +1116,6 @@ void eeh_add_device_late(struct pci_dev *dev) return; } - if (eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe(pdn, NULL); - /* * The EEH cache might not be removed correctly because of * unbalanced kref to the device during unplug time, which @@ -1142,6 +1139,9 @@ void eeh_add_device_late(struct pci_dev *dev) dev->dev.archdata.edev = NULL; } + if (eeh_has_flag(EEH_PROBE_MODE_DEV)) + eeh_ops->probe(pdn, NULL); + edev->pdev = dev; dev->dev.archdata.edev = edev; -- 2.1.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cxl: Plug irq_bitmap getting leaked in cxl_context
This patch plugs the leak of irq_bitmap, allocated as part of initialization of cxl_context struct; during the call to afu_allocate_irqs. The bitmap is now released during the call to afu_release_irqs. Reported-by: Matthew R. Ochs Signed-off-by: Vaibhav Jain --- drivers/misc/cxl/irq.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c index 680cd26..c8f1f9d 100644 --- a/drivers/misc/cxl/irq.c +++ b/drivers/misc/cxl/irq.c @@ -511,4 +511,8 @@ void afu_release_irqs(struct cxl_context *ctx, void *cookie) afu_irq_name_free(ctx); cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter); + + kfree(ctx->irq_bitmap); + ctx->irq_bitmap = NULL; + ctx->irq_count = 0; } -- 2.2.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] video: fbdev: fsl: Fix the sleep function for FSL DIU module
From: Jason Jin For deep sleep, the diu module will power off, when waking up from the deep sleep, the registers need to be reinitialized. Signed-off-by: Jason Jin Signed-off-by: Wang Dongsheng --- *v2* Changes: - int i -> unsigned int i. Remove: - struct mfb_info *mfbi; diff --git a/drivers/video/fbdev/fsl-diu-fb.c b/drivers/video/fbdev/fsl-diu-fb.c index 7fa2e6f..b335c1a 100644 --- a/drivers/video/fbdev/fsl-diu-fb.c +++ b/drivers/video/fbdev/fsl-diu-fb.c @@ -1628,9 +1628,16 @@ static int fsl_diu_suspend(struct platform_device *ofdev, pm_message_t state) static int fsl_diu_resume(struct platform_device *ofdev) { struct fsl_diu_data *data; + unsigned int i; data = dev_get_drvdata(&ofdev->dev); - enable_lcdc(data->fsl_diu_info); + + fsl_diu_enable_interrupts(data); + update_lcdc(data->fsl_diu_info); + for (i = 0; i < NUM_AOIS; i++) { + if (data->mfb[i].count) + fsl_diu_enable_panel(&data->fsl_diu_info[i]); + } return 0; } -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH] video/fsl: Fix the sleep function for FSL DIU module
Hi Tabi, > -Original Message- > From: Timur Tabi [mailto:ti...@tabi.org] > Sent: Tuesday, March 25, 2014 11:55 PM > To: Wang Dongsheng-B40534 > Cc: Wood Scott-B07421; Jin Zhengxiong-R64188; Li Yang-Leo-R58472; linuxppc- > d...@lists.ozlabs.org; linux-fb...@vger.kernel.org > Subject: Re: [PATCH] video/fsl: Fix the sleep function for FSL DIU module > > On 03/25/2014 02:56 AM, Dongsheng Wang wrote: > > From: Jason Jin > > > > For deep sleep, the diu module will power off, when wake up from the > > deep sleep, the registers need to be reinitialized. > > > > Signed-off-by: Jason Jin > > Signed-off-by: Wang Dongsheng > > > > diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c > > index e8758b9..7ec780c 100644 > > --- a/drivers/video/fsl-diu-fb.c > > +++ b/drivers/video/fsl-diu-fb.c > > @@ -1628,9 +1628,18 @@ static int fsl_diu_suspend(struct platform_device > *ofdev, pm_message_t state) > > static int fsl_diu_resume(struct platform_device *ofdev) > > { > > struct fsl_diu_data *data; > > + struct mfb_info *mfbi; > > You don't need this, if ... > > > + int i; > > > > data = dev_get_drvdata(&ofdev->dev); > > - enable_lcdc(data->fsl_diu_info); > > + fsl_diu_enable_interrupts(data); > > + update_lcdc(data->fsl_diu_info); > > + > > + for (i = 0; i < NUM_AOIS; i++) { > > + mfbi = &data->mfb[i]; > > + if (mfbi->count) > > ... you do this: > > if (data->mfb[i].count) > > Also, 'i' should be an 'unsigned int'. > > > + fsl_diu_enable_panel(&data->fsl_diu_info[i]); > > + } > > > > return 0; > > } > > > > Other than that, this seems okay. > Thanks, send v2 to update this patch. Regards, -Dongsheng ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 05/11] cxl: Clean up adapter MMIO unmap path.
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 04/11] cxl: Make IRQ release idempotent
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 03/11] cxl: Allocate and release the SPA with the AFU
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:21 +1000: > Previously the SPA was allocated and freed upon entering and leaving > AFU-directed mode. This causes some issues for error recovery - contexts > hold a pointer inside the SPA, and they may persist after the AFU has > been detached. > > We would ideally like to allocate the SPA when the AFU is allocated, and > not release it until the AFU is released. However, we don't know how big the > SPA needs to be until we read the AFU descriptor. > > Therefore, restructure the code: > > - Allocate the SPA only once, on the first attach. > > - Release the SPA only when the entire AFU is being released (not >detached). Guard the release with a NULL check, so we don't free >if it was never allocated (e.g. dedicated mode) This is certainly an improvement, though in the long run I wonder if we should consider making the contexts increase the refcount of the AFU so that we can be sure that the AFU structure will outlive the contexts? That would be a more significant rework though, and this patch is needed either way and solves an immediate problem, so: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 02/11] cxl: Drop commands if the PCI channel is not in normal state
Acked-by: Ian Munsie Excerpts from Daniel Axtens's message of 2015-08-13 14:11:20 +1000: > +/* Only warn if we detached while the link was OK. Only because mpe is sure to pick this up (I personally don't mind) - block comments should start with /* on a line by itself. > +/* If the adapter has gone down, we can assume that we ... > +/* We could be asked to terminate when the hw is down. That ... > +/* We could be asked to remove when the hw is down. Again, if ... > +/* If the adapter has gone away, we can't get any meaningful ... > +/* Config space IO is based on phb->cfg_addr, which is based on Ditto. Cheers, -Ian ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 01/11] cxl: Convert MMIO read/write macros to inline functions
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] book3s_hv_rmhandlers:Pass the correct trap argument to kvmhv_commence_exit
On Thu, May 21, 2015 at 01:57:04PM +0530, Gautham R. Shenoy wrote: > In guest_exit_cont we call kvmhv_commence_exit which expects the trap > number as the argument. However r3 doesn't contain the trap number at > this point and as a result we would be calling the function with a > spurious trap number. > > Fix this by copying r12 into r3 before calling kvmhv_commence_exit as > r12 contains the trap number > > Signed-off-by: Gautham R. Shenoy Hi Gautham, I agree with your logic: r3 is quite clearly corrupted in that path. So: Reviewed-by: Sam Bobroff Just one comment: Do you have a case of this causing some visible problem due to the corrupted trap number? (I'll test the patch if you do.) Cheers, Sam. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] powerpc: Add an inline function to update POWER8 HID0
On Wed, Aug 05, 2015 at 12:38:31PM +0530, Gautham R. Shenoy wrote: > Section 3.7 of Version 1.2 of the Power8 Processor User's Manual > prescribes that updates to HID0 be preceded by a SYNC instruction and > followed by an ISYNC instruction (Page 91). > > Create an inline function name update_power8_hid0() which follows this > recipe and invoke it from the static split core path. > > Signed-off-by: Gautham R. Shenoy Hi Gautham, I've tested this on a Power 8 machine and verified that it is able to change split modes and that when doing so the new code is used. Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] powerpc/85xx: Add binding for SCFG
From: Wang Dongsheng SCFG provides SoC specific configuration and status registers for the chip. Add this for powerpc platform. Signed-off-by: Wang Dongsheng --- *V2* - Remove scfg description in board.txt and create scfg.txt for scfg. - Change "fsl,-scfg" to "fsl,-scfg" diff --git a/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt b/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt new file mode 100644 index 000..0532c46 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt @@ -0,0 +1,18 @@ +Freescale Supplemental configuration unit (SCFG) + +SCFG is the supplemental configuration unit, that provides SoC specific +configuration and status registers for the chip. Such as getting PEX port +status. + +Required properties: + +- compatible: should be "fsl,<chip>-scfg" +- reg: should contain base address and length of SCFG memory-mapped +registers + +Example: + + scfg: global-utilities@fc000 { + compatible = "fsl,t1040-scfg"; + reg = <0xfc000 0x1000>; + }; -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V2] QorIQ/TMU: add thermal management support based on TMU
Hello Hongtao, On Fri, Aug 14, 2015 at 03:15:22AM +, Hongtao Jia wrote: > Hi Eduardo, > > In previous mail I asked questions about including header files in device > tree. > Don't bother, I have already figured out the solution. > > Another questions is about cpu cooling: > I found out that there is no explicit calling for registering cpu cooling > device in the of-thermal style drivers. Your understanding is correct. > > And Samsung did it in cpufreq driver: drivers/cpufreq/exynos-cpufreq.c > Yes. > Should all the of-thermal driver use the same way? of-thermal won't handle the cooling device registering. It is typically registered by the cpufreq driver. Have a look in drivers/cpufreq/cpufreq-dt.c > Or is there any recommendation for registering cpu cooling device? > (I enabled the CONFIG_CPUFREQ_DT and still got no cooling device registered) If your system supports using cpufreq-dt, then it will handle registering the cpucooling for you, if you configures the cooling dt properties in your DT files. How does your DT entry look like? BR, Eduardo > > Thanks. > > --- > Best Regards, > Hongtao > > > > -Original Message- > > From: Linuxppc-dev [mailto:linuxppc-dev- > > bounces+b38951=freescale@lists.ozlabs.org] On Behalf Of Hongtao Jia > > Sent: Friday, August 07, 2015 4:15 PM > > To: Eduardo Valentin > > Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org; linux- > > p...@vger.kernel.org > > Subject: RE: [PATCH V2] QorIQ/TMU: add thermal management support based > > on TMU > > > > Thanks for your comments. > > Please see my questions inline. > > > > Thanks. 
> > --- > > Best Regards, > > Hongtao > > > > > > > -Original Message- > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > Sent: Thursday, August 06, 2015 3:43 AM > > > To: Jia Hongtao-B38951 > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > Scott- > > > B07421 > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > based on TMU > > > > > > On Thu, Jul 30, 2015 at 08:13:09AM +, Hongtao Jia wrote: > > > > - "Any specific reason why not using OF thermal?" > > > > - No, actually. > > > > > > > > I'd like to use OF thermal after some clarification. > > > > > > > > Regarding to "cooling-maps". For some cases there should be more > > > > than one cpus as cooling device and they are independent. > > > > 1. Let's say 4. So we need to provide 4 maps like map0-map3. Right? > > > > > > That would depend on the amount of sensors you have. Do you have one > > > sensor per cpu? if the answer is yes, then you probably want to have > > > four different map entries, yes, but one on each thermal zone of each > > > cpu temperature sensor. if the answer is no, then you would need to > > > have all the maps in the same thermal zone. > > > > > > > 2. "cooling-max-level" may vary depend on switch settings or firmware. > > > Is that > > > >OK if I do not provide "cooling-min-level" and "cooling-max-level" > > > property? > > > > > > That is already achievable by using the cooling-device property of a > > > cooling map. > > > > > > Please have a look in the example section of the > > > Documentation/devicetree/bindings/thermal/thermal.txt > > > > Yes, I read this file. > > So in my understanding: > > There is no need to provide "cooling-min-level" and "cooling-max-level" > > property. > > THERMAL_NO_LIMIT value in cooling device node will indicate the driver to > > automatically parse the min and max state, right? > > > > Talking about THERMAL_NO_LIMIT, I need to #include > bindings/thermal/thermal.h> to provide the definition. 
But I got > > compiling error when build dtb file. > > I did some research and using "make t1040qds.dtb" in order to involve > > preprocessor. > > But with simply adding "#include " to > > t1040si-post.dtsi at line 35 I still got error like this: > > Error: arch/powerpc/boot/dts/fsl/t1040si-post.dtsi:35.1-9 syntax error > > FATAL ERROR: Unable to parse input tree > > > > Could you help me out here. > > Thanks. > > > > > > > > Let me know if you need further clarification. > > > > > > > > > BR, > > > > > > Eduardo Valentin > > > > > > > > > > > Thanks. > > > > -Hongtao > > > > > > > > > > > > > -Original Message- > > > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > > > Sent: Thursday, July 30, 2015 2:56 PM > > > > > To: Jia Hongtao-B38951 > > > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > > > Scott- > > > > > B07421 > > > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > > > based on TMU > > > > > > > > > > On Wed, Jul 29, 2015 at 02:19:39PM +0800, Jia Hongtao wrote: > > > > > > It supports one critical trip point and one passive trip point. > > > > > > The cpufreq is used as the cooling device to throttle CPUs when > > > > > > the passive trip is crossed. > > > > > > > > > > > > Signed-off-by: Jia Hongtao > > > > > > --- > > > > > > This patch base
Re: [PATCH 29/31] parisc: handle page-less SG entries
From: James Bottomley Date: Thu, 13 Aug 2015 20:59:20 -0700 > On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote: >> On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: >> > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: >> >> I'm assuming that anybody who wants to use the page-less >> >> scatter-gather lists always does so on memory that isn't actually >> >> virtually mapped at all, or only does so on sane architectures that >> >> are cache coherent at a physical level, but I'd like that assumption >> >> *documented* somewhere. >> > >> > It's temporarily mapped by kmap-like helpers. That code isn't in >> > this series. The most recent version of it is here: >> > >> > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 >> > >> > note that it's not doing the cache flushing it would have to do yet, but >> > it's also only enabled for x86 at the moment. >> >> For virtually tagged caches I assume we would temporarily map with >> kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements >> powerpc support. However with DAX we could end up with multiple >> virtual aliases for a page-less pfn. > > At least on some PA architectures, you have to be very careful. > Improperly managed, multiple aliases will cause the system to crash > (actually a machine check in the cache chequerboard). For the most > temperamental systems, we need the cache line flushed and the alias > mapping ejected from the TLB cache before we access the same page at an > inequivalent alias. Also, I want to mention that on sparc64 we manage the cache aliasing state in the page struct. Until a page is mapped into userspace, we just record the most recent cpu to store into that page with kernel side mappings. Once the page ends up being mapped or the cpu doing kernel side stores changes, we actually perform the cache flush. 
Generally speaking, I think that all actual physical memory the kernel operates on should have a struct page backing it. So this whole discussion of operating on physical memory in scatter lists without backing page structs feels really foreign to me. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote: > On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: > > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: > >> I'm assuming that anybody who wants to use the page-less > >> scatter-gather lists always does so on memory that isn't actually > >> virtually mapped at all, or only does so on sane architectures that > >> are cache coherent at a physical level, but I'd like that assumption > >> *documented* somewhere. > > > > It's temporarily mapped by kmap-like helpers. That code isn't in > > this series. The most recent version of it is here: > > > > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 > > > > note that it's not doing the cache flushing it would have to do yet, but > > it's also only enabled for x86 at the moment. > > For virtually tagged caches I assume we would temporarily map with > kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements > powerpc support. However with DAX we could end up with multiple > virtual aliases for a page-less pfn. At least on some PA architectures, you have to be very careful. Improperly managed, multiple aliases will cause the system to crash (actually a machine check in the cache chequerboard). For the most temperamental systems, we need the cache line flushed and the alias mapping ejected from the TLB cache before we access the same page at an inequivalent alias. James ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
On Fri, Aug 14, 2015 at 11:03:00AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:11PM +0800, Wei Yang wrote: >>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be >>sparse. >> >>This patch restructures the patch to allocate sparse PE# for VFs when M64 >>BAR is set to Single PE mode. >> >>Signed-off-by: Wei Yang >>--- >> arch/powerpc/include/asm/pci-bridge.h |2 +- >> arch/powerpc/platforms/powernv/pci-ioda.c | 59 >> +++-- >> 2 files changed, 41 insertions(+), 20 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/pci-bridge.h >>b/arch/powerpc/include/asm/pci-bridge.h >>index 9d33ada..b026ef8 100644 >>--- a/arch/powerpc/include/asm/pci-bridge.h >>+++ b/arch/powerpc/include/asm/pci-bridge.h >>@@ -214,7 +214,7 @@ struct pci_dn { >> #ifdef CONFIG_PCI_IOV >> u16 vfs_expanded; /* number of VFs IOV BAR expanded */ >> u16 num_vfs;/* number of VFs enabled*/ >>- int offset; /* PE# for the first VF PE */ >>+ int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ > >Same question as to "m64_map". pdn for non-PF doesn't need it. > The same, I prefer the dynamic version. 
>> boolm64_single_mode;/* Use M64 BAR in Single Mode */ >> #define IODA_INVALID_M64(-1) >> int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 1e6ac86..7633538 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >>u16 num_vfs) >> >> /* Map the M64 here */ >> if (pdn->m64_single_mode) { >>- pe_num = pdn->offset + j; >>+ pe_num = pdn->pe_num_map[j]; >> rc = opal_pci_map_pe_mmio_window(phb->opal_id, >> pe_num, OPAL_M64_WINDOW_TYPE, >> pdn->m64_map[i][j], 0); >>@@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) >> struct pnv_phb*phb; >> struct pci_dn *pdn; >> struct pci_sriov *iov; >>- u16 num_vfs; >>+ u16 num_vfs, i; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) >> >> if (phb->type == PNV_PHB_IODA2) { >> if (!pdn->m64_single_mode) >>- pnv_pci_vf_resource_shift(pdev, -pdn->offset); >>+ pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); >> >> /* Release M64 windows */ >> pnv_pci_vf_release_m64(pdev); >> >> /* Release PE numbers */ >>- bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >>- pdn->offset = 0; >>+ if (pdn->m64_single_mode) { >>+ for (i = 0; i < num_vfs; i++) >>+ pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); >>+ } else >>+ bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], >>num_vfs); >> } >> } >> >>@@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, >>u16 num_vfs) >> >> /* Reserve PE for each VF */ >> for (vf_index = 0; vf_index < num_vfs; vf_index++) { >>- pe_num = pdn->offset + vf_index; >>+ if (pdn->m64_single_mode) >>+ pe_num = pdn->pe_num_map[vf_index]; >>+ else >>+ pe_num = pdn->pe_num_map[0] + vf_index; >> >> pe = &phb->ioda.pe_array[pe_num]; >> pe->pe_number = pe_num; >>@@ -1425,6 
+1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >>num_vfs) >> struct pnv_phb*phb; >> struct pci_dn *pdn; >> intret; >>+ u16i; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >>num_vfs) >> } >> >> /* Calculate available PE for required VFs */ >>- mutex_lock(&phb->ioda.pe_alloc_mutex); >>- pdn->offset = bitmap_find_next_zero_area( >>- phb->ioda.pe_alloc, phb->ioda.total_pe, >>- 0, num_vfs, 0); >>- if (pdn->offset >= phb->ioda.total_pe) { >>+ if (pdn->m64_single_mode) { >>+ for (i = 0; i < num_vfs; i++) >>+ pdn->pe_num_map[i] = IODA_INVALID_PE; >>+ for (i = 0; i < num_vfs; i++) { >>+ pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); >>+ if (pdn->pe_num_map[i] == IODA_INVALID_PE) { >>+
Re: [PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
On Fri, Aug 14, 2015 at 10:52:21AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:08PM +0800, Wei Yang wrote: >>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >>BARs in Single PE mode to cover the number of VFs required to be enabled. >>By doing so, several VFs would be in one VF Group and leads to interference >>between VFs in the same group. >> >>This patch changes the design by using one M64 BAR in Single PE mode for >>one VF BAR. This gives absolute isolation for VFs. >> >>Signed-off-by: Wei Yang >>--- >> arch/powerpc/include/asm/pci-bridge.h |6 +- >> arch/powerpc/platforms/powernv/pci-ioda.c | 163 >> +++-- >> 2 files changed, 62 insertions(+), 107 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/pci-bridge.h >>b/arch/powerpc/include/asm/pci-bridge.h >>index 712add5..9d33ada 100644 >>--- a/arch/powerpc/include/asm/pci-bridge.h >>+++ b/arch/powerpc/include/asm/pci-bridge.h >>@@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem >>*address) >> */ >> struct iommu_table; >> >>+#define MAX_M64_BAR 16 > >struct pnv_phb::m64_bar_idx is initialized to 15. Another macro is defined here >as 16. Both of them can be used as maximal M64 BAR number. Obviously, they're >duplicated. On the other hand, I don't think it's a good idea to have the >static >"m64_map" because @pdn is created for every PCI devices, including VFs. non-PF >don't "m64_map", together other fields like "m64_per_iov" at all. It's >obviously >wasting memory. So it would be allocated dynamically when the PF's pdn is >created >or in pnv_pci_ioda_fixup_iov_resources(). > I prefer the dynamic one. Alexey, I changed to static defined based on your comments. So do you have some concern on the dynamic version? >In long run, it might be reasonable to move all SRIOV related fields in pci_dn >to another data struct (struct pci_iov_dn?) and allocate that dynamically. 
> >> int flags; >> #define PCI_DN_FLAG_IOV_VF 0x01 >>@@ -214,10 +215,9 @@ struct pci_dn { >> u16 vfs_expanded; /* number of VFs IOV BAR expanded */ >> u16 num_vfs;/* number of VFs enabled*/ >> int offset; /* PE# for the first VF PE */ >>-#define M64_PER_IOV 4 >>- int m64_per_iov; >>+ boolm64_single_mode;/* Use M64 BAR in Single Mode */ >> #define IODA_INVALID_M64(-1) >>- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >>+ int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >> #endif /* CONFIG_PCI_IOV */ >> #endif >> struct list_head child_list; >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 67b8f72..4da0f50 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev >>*pdev) >> pdn = pci_get_pdn(pdev); >> >> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) { >>- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >>+ for (j = 0; j < MAX_M64_BAR; j++) { >>+ if (pdn->m64_map[i][j] == IODA_INVALID_M64) >> continue; >> opal_pci_phb_mmio_enable(phb->opal_id, >>- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >>- clear_bit(pdn->m64_wins[i][j], >>&phb->ioda.m64_bar_alloc); >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); >>+ clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); >>+ pdn->m64_map[i][j] = IODA_INVALID_M64; >> } >>- >> return 0; >> } >> >>@@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >>u16 num_vfs) >> inttotal_vfs; >> resource_size_tsize, start; >> intpe_num; >>- intvf_groups; >>- intvf_per_group; >>+ intm64_bars; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev >>*pdev, u16 num_vfs) >> pdn = pci_get_pdn(pdev); >> total_vfs = pci_sriov_get_totalvfs(pdev); >> >>- /* Initialize the m64_wins to 
IODA_INVALID_M64 */ >>- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ if (pdn->m64_single_mode) >>+ m64_bars = num_vfs; >>+ else >>+ m64_bars = 1; >>+ >>+ /* Initialize the m64_map to IODA_INVALID_M64 */ >>+ for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) >>+ for (j = 0; j < MAX_M64_BAR; j++) >>+ pdn-
[PATCH v2 2/2] powerpc/mpc85xx:Add SCFG device tree support of T104x
From: Wang Dongsheng Signed-off-by: Wang Dongsheng --- *V2* No changes. diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index 9e9f7e2..9770d02 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -484,6 +484,11 @@ reg= <0xea000 0x4000>; }; + scfg: global-utilities@fc000 { + compatible = "fsl,t1040-scfg"; + reg = <0xfc000 0x1000>; + }; + /include/ "elo3-dma-0.dtsi" /include/ "elo3-dma-1.dtsi" /include/ "qoriq-espi-0.dtsi" -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
On Fri, Aug 14, 2015 at 11:04:58AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:07PM +0800, Wei Yang wrote: >>The alignment of IOV BAR on PowerNV platform is the total size of the IOV >>BAR. No matter whether the IOV BAR is extended with number of >>roundup_pow_of_two(total_vfs) or number of max PE number (256), the total >>size could be calculated by (vfs_expanded * VF_BAR_size). >> >>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the >>first case. >> >>Signed-off-by: Wei Yang >>Reviewed-by: Gavin Shan >>--- >> arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- >> 1 file changed, 9 insertions(+), 5 deletions(-) >> >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 9ac324e..67b8f72 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -2987,12 +2987,16 @@ static resource_size_t >>pnv_pci_iov_resource_alignment(struct pci_dev *pdev, >>int resno) >> { >> struct pci_dn *pdn = pci_get_pdn(pdev); >>- resource_size_t align, iov_align; >>- >>- iov_align = resource_size(&pdev->resource[resno]); >>- if (iov_align) >>- return iov_align; >>+ resource_size_t align; >> >>+ /* >>+ * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the >>+ * SR-IOV. While from hardware perspective, the range mapped by M64 >>+ * BAR should be size aligned. >>+ * >>+ * This function return the total IOV BAR size if expanded or just the >>+ * individual size if not. >>+ */ > >s/return/returns > Thanks :-) >> align = pci_iov_resource_size(pdev, resno); >> if (pdn->vfs_expanded) >> return pdn->vfs_expanded * align; >>-- >>1.7.9.5 >> -- Richard Yang Help you, Help me ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] powerpc/e6500: hw tablewalk: order the memory access when acquire/release tcd lock
On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote: > I didn't find anything unusual. But I think we do need to order the > load/store of esel_next when acquire/release tcd lock. For acquire, > add a data dependency to order the loads of lock and esel_next. > For release, even there already have a "isync" here, but it doesn't > guarantee any memory access order. So we still need "lwsync" for > the two stores for lock and esel_next. I was going to say that esel_next is just a hint and it doesn't really matter if we occasionally get the wrong value, unless it happens often enough to cause more performance degradation than the lwsync causes. However, with the A-008139 workaround we do need to read the same value from esel_next both times. It might be less costly to save/restore an additional register instead of lwsync, though. -Scott ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: >> I'm assuming that anybody who wants to use the page-less >> scatter-gather lists always does so on memory that isn't actually >> virtually mapped at all, or only does so on sane architectures that >> are cache coherent at a physical level, but I'd like that assumption >> *documented* somewhere. > > It's temporarily mapped by kmap-like helpers. That code isn't in > this series. The most recent version of it is here: > > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 > > note that it's not doing the cache flushing it would have to do yet, but > it's also only enabled for x86 at the moment. For virtually tagged caches I assume we would temporarily map with kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements powerpc support. However with DAX we could end up with multiple virtual aliases for a page-less pfn. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH V2] QorIQ/TMU: add thermal management support based on TMU
Hi Eduardo, In previous mail I asked questions about including header files in device tree. Don't bother, I have already figured out the solution. Another questions is about cpu cooling: I found out that there is no explicit calling for registering cpu cooling device in the of-thermal style drivers. And Samsung did it in cpufreq driver: drivers/cpufreq/exynos-cpufreq.c Should all the of-thermal driver use the same way? Or is there any recommendation for registering cpu cooling device? (I enabled the CONFIG_CPUFREQ_DT and still got no cooling device registered) Thanks. --- Best Regards, Hongtao > -Original Message- > From: Linuxppc-dev [mailto:linuxppc-dev- > bounces+b38951=freescale@lists.ozlabs.org] On Behalf Of Hongtao Jia > Sent: Friday, August 07, 2015 4:15 PM > To: Eduardo Valentin > Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org; linux- > p...@vger.kernel.org > Subject: RE: [PATCH V2] QorIQ/TMU: add thermal management support based > on TMU > > Thanks for your comments. > Please see my questions inline. > > Thanks. > --- > Best Regards, > Hongtao > > > > -Original Message- > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > Sent: Thursday, August 06, 2015 3:43 AM > > To: Jia Hongtao-B38951 > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > Scott- > > B07421 > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > based on TMU > > > > On Thu, Jul 30, 2015 at 08:13:09AM +, Hongtao Jia wrote: > > > - "Any specific reason why not using OF thermal?" > > > - No, actually. > > > > > > I'd like to use OF thermal after some clarification. > > > > > > Regarding to "cooling-maps". For some cases there should be more > > > than one cpus as cooling device and they are independent. > > > 1. Let's say 4. So we need to provide 4 maps like map0-map3. Right? > > > > That would depend on the amount of sensors you have. Do you have one > > sensor per cpu? 
if the answer is yes, then you probably want to have > > four different map entries, yes, but one on each thermal zone of each > > cpu temperature sensor. if the answer is no, then you would need to > > have all the maps in the same thermal zone. > > > > > 2. "cooling-max-level" may vary depend on switch settings or firmware. > > Is that > > >OK if I do not provide "cooling-min-level" and "cooling-max-level" > > property? > > > > That is already achievable by using the cooling-device property of a > > cooling map. > > > > Please have a look in the example section of the > > Documentation/devicetree/bindings/thermal/thermal.txt > > Yes, I read this file. > So in my understanding: > There is no need to provide "cooling-min-level" and "cooling-max-level" > property. > THERMAL_NO_LIMIT value in cooling device node will indicate the driver to > automatically parse the min and max state, right? > > Talking about THERMAL_NO_LIMIT, I need to #include bindings/thermal/thermal.h> to provide the definition. But I got > compiling error when build dtb file. > I did some research and using "make t1040qds.dtb" in order to involve > preprocessor. > But with simply adding "#include " to > t1040si-post.dtsi at line 35 I still got error like this: > Error: arch/powerpc/boot/dts/fsl/t1040si-post.dtsi:35.1-9 syntax error > FATAL ERROR: Unable to parse input tree > > Could you help me out here. > Thanks. > > > > > Let me know if you need further clarification. > > > > > > BR, > > > > Eduardo Valentin > > > > > > > > Thanks. 
> > > -Hongtao > > > > > > > > > > -Original Message- > > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > > Sent: Thursday, July 30, 2015 2:56 PM > > > > To: Jia Hongtao-B38951 > > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > > Scott- > > > > B07421 > > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > > based on TMU > > > > > > > > On Wed, Jul 29, 2015 at 02:19:39PM +0800, Jia Hongtao wrote: > > > > > It supports one critical trip point and one passive trip point. > > > > > The cpufreq is used as the cooling device to throttle CPUs when > > > > > the passive trip is crossed. > > > > > > > > > > Signed-off-by: Jia Hongtao > > > > > --- > > > > > This patch based on: > > > > > http://patchwork.ozlabs.org/patch/482987/ > > > > > > > > > > Changes for V2: > > > > > * Add tmu-range parse. > > > > > * Use default trend hook. > > > > > * Using latest thermal_zone_bind_cooling_device API. > > > > > * Add calibration check during initialization. > > > > > * Disable/enalbe device when suspend/resume. > > > > > > > > > > drivers/thermal/Kconfig | 11 ++ > > > > > drivers/thermal/Makefile| 1 + > > > > > drivers/thermal/qoriq_thermal.c | 406 > > > > > > > > > > 3 files changed, 418 insertions(+) create mode 100644 > > > > > drivers/thermal/qoriq_thermal.c > > > > > > > > > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > > > >
[PATCH 1/1] powerpc/xmon: Paged output for paca display
The paca display is already more than 24 lines, which can be problematic if you have an old school 80x24 terminal, or more likely you are on a virtual terminal which does not scroll for whatever reason. This adds an optional letter to the "dp" and "dpa" xmon commands ("dpp" and "dppa"), which will enable a "per-page" display (with 16 line pages): the first page will be displayed and if there was data that didn't fit, it will display a message indicating that the user can use enter to display the next page. The intent is that this feels similar to the way the memory display functions work. This is implemented by running over the entire output both for the initial command and for each subsequent page: the visible part is clipped out by checking line numbers. Handling the empty command as "more" is done by writing a special command into a static buffer that indicates where to move the sliding visibility window. This is similar to the approach used for the memory dump commands except that the state data is encoded into the "last_cmd" string, rather than a set of static variables. The memory dump commands could probably be rewritten to make use of the same buffer and remove their other static variables. 
Sample output: 0:mon> dpp1 paca for cpu 0x1 @ cfdc0480: possible = yes present = yes online = yes lock_token = 0x8000 (0x8) paca_index = 0x1 (0xa) kernel_toc = 0xc0eb2400 (0x10) kernelbase = 0xc000 (0x18) kernel_msr = 0xb0001032 (0x20) emergency_sp = 0xc0003ffe8000 (0x28) mc_emergency_sp = 0xc0003ffe4000 (0x2e0) in_mce = 0x0 (0x2e8) data_offset = 0x7f17 (0x30) hw_cpu_id= 0x8 (0x38) cpu_start= 0x1 (0x3a) kexec_state = 0x0 (0x3b) [Enter for next page] 0:mon> __current= 0xc0007e696620 (0x290) kstack = 0xc0007e6ebe30 (0x298) stab_rr = 0xb (0x2a0) saved_r1 = 0xc0007ef37860 (0x2a8) trap_save= 0x0 (0x2b8) soft_enabled = 0x0 (0x2ba) irq_happened = 0x1 (0x2bb) io_sync = 0x0 (0x2bc) irq_work_pending = 0x0 (0x2bd) nap_state_lost = 0x0 (0x2be) 0:mon> (Based on a similar patch by Michael Ellerman "[v2] powerpc/xmon: Allow limiting the size of the paca display". This patch is an alternative and cannot coexist with the original.) Signed-off-by: Sam Bobroff --- arch/powerpc/xmon/xmon.c | 82 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e599259..9157286 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -72,6 +72,7 @@ static int xmon_gate; static unsigned long in_xmon __read_mostly = 0; +static char last_cmd_buf[128]; static unsigned long adrs; static int size = 1; #define MAX_DUMP (128 * 1024) @@ -204,8 +205,8 @@ Commands:\n\ dldump the kernel log buffer\n" #ifdef CONFIG_PPC64 "\ - dp[#]dump paca for current cpu, or cpu #\n\ - dpa dump paca for all possible cpus\n" + dp[p][#] dump paca for current cpu, or cpu # (p = paged)\n\ + dp[p]a dump paca for all possible cpus (p = paged)\n" #endif "\ dr dump stream of raw bytes\n\ @@ -2070,7 +2071,17 @@ static void xmon_rawdump (unsigned long adrs, long ndump) } #ifdef CONFIG_PPC64 -static void dump_one_paca(int cpu) +static bool line_visible(unsigned long start, unsigned long count, +unsigned long *line) { + bool rv = (!count + || ((*line >= 
start) && (*line < (start + count; + + (*line)++; + return rv; +} + +static void dump_one_paca(int cpu, unsigned long start, + unsigned long count, unsigned long *line) { struct paca_struct *p; @@ -2084,15 +2095,22 @@ static void dump_one_paca(int cpu) p = &paca[cpu]; - printf("paca for cpu 0x%x @ %p:\n", cpu, p); +#define VPRINT(...) do { \ + if (line_visible(start, count, line)) \ + printf(__VA_ARGS__); \ +} while (0) + VPRINT("paca for cpu 0x%x @ %p:\n", cpu, p); - printf(" %-*s = %s\n", 16, "possible", cpu_possible(cpu) ? "yes" : "no"); - printf(" %-*s = %s\n", 16, "present", cpu_present(cpu) ? "yes" : "no"); - printf(" %-*s = %s\n", 16, "online", cpu_online(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "possible", cpu_possible(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "present", cpu_present(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "online", cpu_online(cpu) ? "yes" : "no"); +#undef VPRINT -#define DUMP(paca, name, format) \ - printf(" %-*s = %#-*"format"\t(0x%lx)\n", 16, #name, 1
Re: [V3] powerpc/irq: Enable some more exceptions in /proc/interrupts interface
On Thu, 2015-08-06 at 18:54 +0530, Anshuman Khandual wrote: > On 08/04/2015 03:27 PM, Michael Ellerman wrote: > > On Mon, 2015-13-07 at 08:16:06 UTC, Anshuman Khandual wrote: > >> This patch enables facility unavailable exceptions for generic facility, > >> FPU, ALTIVEC and VSX in /proc/interrupts listing by incrementing their > >> newly added IRQ statistical counters as and when these exceptions happen. > >> This also adds couple of helper functions which will be called from within > >> the interrupt handler context to update their statistics. Similarly this > >> patch also enables alignment and program check exceptions as well. > > > > ... > > > >> diff --git a/arch/powerpc/kernel/exceptions-64s.S > >> b/arch/powerpc/kernel/exceptions-64s.S > >> index 0a0399c2..a86180c 100644 > >> --- a/arch/powerpc/kernel/exceptions-64s.S > >> +++ b/arch/powerpc/kernel/exceptions-64s.S > >> @@ -1158,6 +1158,7 @@ BEGIN_FTR_SECTION > >> END_FTR_SECTION_IFSET(CPU_FTR_TM) > >> #endif > >>bl load_up_fpu > >> + bl fpu_unav_exceptions_count > > > > Is it safe to call C code here? > > Hmm, is it not ? I had that question but was not really sure. Dont > understand the difference between 'fast_exception_return' and > 'ret_from_except' completely. If you're "not really sure" it's correct, please say so in the change log! I'd rather you didn't send me patches with possibly subtle bugs in core code. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] kvm:powerpc:Fix return statements for wrapper functions in the file book3s_64_mmu_hv.c
On Wed, 2015-08-12 at 21:06 +0200, Alexander Graf wrote: > > On 10.08.15 17:27, Nicholas Krause wrote: > > This fixes the wrapper functions kvm_umap_hva_hv and the function > > kvm_unmap_hav_range_hv to return the return value of the function > > kvm_handle_hva or kvm_handle_hva_range that they are wrapped to > > call internally rather then always making the caller of these > > wrapper functions think they always run successfully by returning > > the value of zero directly. > > > > Signed-off-by: Nicholas Krause > > Paul, could you please take on this one? Paul's away for a while can you take it directly? cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel
On Wed, 2015-08-05 at 14:03 +1000, Anton Blanchard wrote: > Hi, > > While looking at traces of kernel workloads, I noticed places where gcc > used a large number of non volatiles. Some of these functions > did very little work, and we spent most of our time saving the > non volatiles to the stack and reading them back. > > It made me wonder if we have the right ratio of volatile to non > volatile GPRs. Since the kernel is completely self contained, we could > potentially change that ratio. > > Attached is a quick hack to gcc and the kernel to decrease the number > of non volatile GPRs to 8. I'm not sure if this is a good idea (and if > the volatile to non volatile ratio is right), but this gives us > something to play with. OK, interesting idea. Can't say I'd ever though of that. I'm thinking we'd want some pretty solid analysis of the resulting code-gen and real world perf before we made a switch like that. Presumably it's going to hurt our null syscall, due to the added save/restores, but hopefully help with paths that do actual work. If the caller is actually using the non-volatiles then presumably it will be a wash, because the caller will have to do the save anyway. Though maybe it would still be a win because the caller can do the saves & restores when it needs to rather than all in a block. I'm also not clear on how it would affect folks who build modules separate from the kernel. We'd have to make sure they had the right GCC, or things would go badly wrong, unless it can be done with command line flags? I don't know how much we care about that but distros presumably do. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2] powerpc/xmon: Allow limiting the size of the paca display
On Wed, Aug 12, 2015 at 09:55:25PM +1000, Michael Ellerman wrote: > The paca display is already more than 24 lines, which can be problematic > if you have an old school 80x24 terminal, or more likely you are on a > virtual terminal which does not scroll for whatever reason. > > We'd like to expand the paca display even more, so add a way to limit > the number of lines that are displayed. > > This adds a third form of 'dp' which is 'dp # #', where the first number > is the cpu, and the second is the number of lines to display. > > Example output: > > 5:mon> dp 3 6 > paca for cpu 0x3 @ cfe00c00: >possible = yes >present = yes >online = yes >lock_token = 0x8000 (0xa) >paca_index = 0x3 (0x8) Michael, This patch inspired me to do the additional work to make the output paged, more like the memory dump commands. I'll post it shortly as "powerpc/xmon: Paged output for paca display". Cheers, Sam. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
On Thu, Aug 13, 2015 at 10:11:07PM +0800, Wei Yang wrote: >The alignment of IOV BAR on PowerNV platform is the total size of the IOV >BAR. No matter whether the IOV BAR is extended with number of >roundup_pow_of_two(total_vfs) or number of max PE number (256), the total >size could be calculated by (vfs_expanded * VF_BAR_size). > >This patch simplifies the pnv_pci_iov_resource_alignment() by removing the >first case. > >Signed-off-by: Wei Yang >Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- > 1 file changed, 9 insertions(+), 5 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 9ac324e..67b8f72 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2987,12 +2987,16 @@ static resource_size_t >pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > int resno) > { > struct pci_dn *pdn = pci_get_pdn(pdev); >- resource_size_t align, iov_align; >- >- iov_align = resource_size(&pdev->resource[resno]); >- if (iov_align) >- return iov_align; >+ resource_size_t align; > >+ /* >+ * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the >+ * SR-IOV. While from hardware perspective, the range mapped by M64 >+ * BAR should be size aligned. >+ * >+ * This function return the total IOV BAR size if expanded or just the >+ * individual size if not. >+ */ s/return/returns > align = pci_iov_resource_size(pdev, resno); > if (pdn->vfs_expanded) > return pdn->vfs_expanded * align; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
On Thu, Aug 13, 2015 at 10:11:11PM +0800, Wei Yang wrote: >When M64 BAR is set to Single PE mode, the PE# assigned to VF could be >sparse. > >This patch restructures the patch to allocate sparse PE# for VFs when M64 >BAR is set to Single PE mode. > >Signed-off-by: Wei Yang >--- > arch/powerpc/include/asm/pci-bridge.h |2 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 59 +++-- > 2 files changed, 41 insertions(+), 20 deletions(-) > >diff --git a/arch/powerpc/include/asm/pci-bridge.h >b/arch/powerpc/include/asm/pci-bridge.h >index 9d33ada..b026ef8 100644 >--- a/arch/powerpc/include/asm/pci-bridge.h >+++ b/arch/powerpc/include/asm/pci-bridge.h >@@ -214,7 +214,7 @@ struct pci_dn { > #ifdef CONFIG_PCI_IOV > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs;/* number of VFs enabled*/ >- int offset; /* PE# for the first VF PE */ >+ int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ Same question as to "m64_map". pdn for non-PF doesn't need it. > boolm64_single_mode;/* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64(-1) > int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 1e6ac86..7633538 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > > /* Map the M64 here */ > if (pdn->m64_single_mode) { >- pe_num = pdn->offset + j; >+ pe_num = pdn->pe_num_map[j]; > rc = opal_pci_map_pe_mmio_window(phb->opal_id, > pe_num, OPAL_M64_WINDOW_TYPE, > pdn->m64_map[i][j], 0); >@@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > struct pnv_phb*phb; > struct pci_dn *pdn; > struct pci_sriov *iov; >- u16 num_vfs; >+ u16 num_vfs, i; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > > if (phb->type == 
PNV_PHB_IODA2) { > if (!pdn->m64_single_mode) >- pnv_pci_vf_resource_shift(pdev, -pdn->offset); >+ pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); > > /* Release M64 windows */ > pnv_pci_vf_release_m64(pdev); > > /* Release PE numbers */ >- bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >- pdn->offset = 0; >+ if (pdn->m64_single_mode) { >+ for (i = 0; i < num_vfs; i++) >+ pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); >+ } else >+ bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], >num_vfs); > } > } > >@@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, >u16 num_vfs) > > /* Reserve PE for each VF */ > for (vf_index = 0; vf_index < num_vfs; vf_index++) { >- pe_num = pdn->offset + vf_index; >+ if (pdn->m64_single_mode) >+ pe_num = pdn->pe_num_map[vf_index]; >+ else >+ pe_num = pdn->pe_num_map[0] + vf_index; > > pe = &phb->ioda.pe_array[pe_num]; > pe->pe_number = pe_num; >@@ -1425,6 +1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > struct pnv_phb*phb; > struct pci_dn *pdn; > intret; >+ u16i; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > } > > /* Calculate available PE for required VFs */ >- mutex_lock(&phb->ioda.pe_alloc_mutex); >- pdn->offset = bitmap_find_next_zero_area( >- phb->ioda.pe_alloc, phb->ioda.total_pe, >- 0, num_vfs, 0); >- if (pdn->offset >= phb->ioda.total_pe) { >+ if (pdn->m64_single_mode) { >+ for (i = 0; i < num_vfs; i++) >+ pdn->pe_num_map[i] = IODA_INVALID_PE; >+ for (i = 0; i < num_vfs; i++) { >+ pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); >+ if (pdn->pe_num_map[i] == IODA_INVALID_PE) { >+ ret = -EBUSY; >+ goto m64_failed; >+ } >+ } >+
Re: [PATCH v3 5/6] powerpc/powernv: boundary the total VF BAR size instead of the individual one
On Thu, Aug 13, 2015 at 10:11:10PM +0800, Wei Yang wrote: >Each VF could have 6 BARs at most. When the total BAR size exceeds the >gate, after expanding it will also exhaust the M64 Window. > >This patch limits the boundary by checking the total VF BAR size instead of >the individual BAR. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 13 +++-- > 1 file changed, 7 insertions(+), 6 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 3e8c0b4..1e6ac86 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > struct pnv_phb *phb; > struct resource *res; > int i; >- resource_size_t size, gate; >+ resource_size_t size, gate, total_vf_bar_sz; > struct pci_dn *pdn; > int mul, total_vfs; > >@@ -2715,6 +2715,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) >* which will exhaust the M64 Space and limit the system flexibility. 
>*/ > gate = phb->ioda.m64_segsize >> 1; >+ total_vf_bar_sz = 0; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; >@@ -2727,13 +2728,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > return; > } > >- size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >+ total_vf_bar_sz += pci_iov_resource_size(pdev, >+ i + PCI_IOV_RESOURCES); > > /* bigger than or equal to gate */ >- if (size >= gate) { >- dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " >- "is bigger than %lld, roundup power2\n", >- i, res, gate); >+ if (total_vf_bar_sz >= gate) { >+ dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size " >+ "is bigger than %lld, roundup power2\n", gate); dev_info(&pdev->dev, "PowerNV: Total VF BAR size %lld " "is bigger than %lld, roundup power2\n", total_vf_bar_sz, gate); > mul = roundup_pow_of_two(total_vfs); > pdn->m64_single_mode = true; > break; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 4/6] powerpc/powernv: replace the hard coded boundary with gate
On Thu, Aug 13, 2015 at 10:11:09PM +0800, Wei Yang wrote: >At the moment 64bit-prefetchable window can be maximum 64GB, which is >currently got from device tree. This means that in shared mode the maximum >supported VF BAR size is 64GB/256=256MB. While this size could exhaust the >whole 64bit-prefetchable window. This is a design decision to set a >boundary to 64MB of the VF BAR size. Since VF BAR size with 64MB would >occupy a quarter of the 64bit-prefetchable window, this is affordable. > >This patch replaces magic limit of 64MB with (m64_segsize >> 1) and adds >comment to explain the reason for it. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 22 +- > 1 file changed, 17 insertions(+), 5 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 4da0f50..3e8c0b4 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > struct pnv_phb *phb; > struct resource *res; > int i; >- resource_size_t size; >+ resource_size_t size, gate; > struct pci_dn *pdn; > int mul, total_vfs; > >@@ -2704,6 +2704,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > > total_vfs = pci_sriov_get_totalvfs(pdev); > mul = phb->ioda.total_pe; >+ /* >+ * If bigger than or equal to half of M64 segment size, just round up >+ * power of two. >+ * >+ * Generally, one M64 BAR maps one IOV BAR. To avoid conflict with >+ * other devices, IOV BAR size is expanded to be (total_pe * >+ * VF_BAR_size). When VF_BAR_size is half of M64 segment size , the >+ * expanded size would equal to half of the whole M64 Space size, >+ * which will exhaust the M64 Space and limit the system flexibility. 
>+ */ s/M64 Space/M64 space >+ gate = phb->ioda.m64_segsize >> 1; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; >@@ -2718,10 +2729,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); > >- /* bigger than 64M */ >- if (size > (1 << 26)) { >- dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size >is bigger than 64M, roundup power2\n", >- i, res); >+ /* bigger than or equal to gate */ >+ if (size >= gate) { >+ dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " >+ "is bigger than %lld, roundup power2\n", >+ i, res, gate); > mul = roundup_pow_of_two(total_vfs); > pdn->m64_single_mode = true; > break; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
On Thu, Aug 13, 2015 at 10:11:08PM +0800, Wei Yang wrote: >In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >BARs in Single PE mode to cover the number of VFs required to be enabled. >By doing so, several VFs would be in one VF Group and leads to interference >between VFs in the same group. > >This patch changes the design by using one M64 BAR in Single PE mode for >one VF BAR. This gives absolute isolation for VFs. > >Signed-off-by: Wei Yang >--- > arch/powerpc/include/asm/pci-bridge.h |6 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 163 +++-- > 2 files changed, 62 insertions(+), 107 deletions(-) > >diff --git a/arch/powerpc/include/asm/pci-bridge.h >b/arch/powerpc/include/asm/pci-bridge.h >index 712add5..9d33ada 100644 >--- a/arch/powerpc/include/asm/pci-bridge.h >+++ b/arch/powerpc/include/asm/pci-bridge.h >@@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem >*address) > */ > struct iommu_table; > >+#define MAX_M64_BAR 16 struct pnv_phb::m64_bar_idx is initialized to 15. Another macro is defined here as 16. Both of them can be used as maximal M64 BAR number. Obviously, they're duplicated. On the other hand, I don't think it's a good idea to have the static "m64_map" because @pdn is created for every PCI devices, including VFs. non-PF don't "m64_map", together other fields like "m64_per_iov" at all. It's obviously wasting memory. So it would be allocated dynamically when the PF's pdn is created or in pnv_pci_ioda_fixup_iov_resources(). In long run, it might be reasonable to move all SRIOV related fields in pci_dn to another data struct (struct pci_iov_dn?) and allocate that dynamically. 
> int flags; > #define PCI_DN_FLAG_IOV_VF0x01 >@@ -214,10 +215,9 @@ struct pci_dn { > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs;/* number of VFs enabled*/ > int offset; /* PE# for the first VF PE */ >-#define M64_PER_IOV 4 >- int m64_per_iov; >+ boolm64_single_mode;/* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64(-1) >- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >+ int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; > #endif /* CONFIG_PCI_IOV */ > #endif > struct list_head child_list; >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 67b8f72..4da0f50 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev) > pdn = pci_get_pdn(pdev); > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 0; j < M64_PER_IOV; j++) { >- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >+ for (j = 0; j < MAX_M64_BAR; j++) { >+ if (pdn->m64_map[i][j] == IODA_INVALID_M64) > continue; > opal_pci_phb_mmio_enable(phb->opal_id, >- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >- clear_bit(pdn->m64_wins[i][j], >&phb->ioda.m64_bar_alloc); >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); >+ clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); >+ pdn->m64_map[i][j] = IODA_INVALID_M64; > } >- > return 0; > } > >@@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > inttotal_vfs; > resource_size_tsize, start; > intpe_num; >- intvf_groups; >- intvf_per_group; >+ intm64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > pdn = pci_get_pdn(pdev); > total_vfs = pci_sriov_get_totalvfs(pdev); > >- /* Initialize the m64_wins to IODA_INVALID_M64 */ >- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 
0; j < M64_PER_IOV; j++) >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ if (pdn->m64_single_mode) >+ m64_bars = num_vfs; >+ else >+ m64_bars = 1; >+ >+ /* Initialize the m64_map to IODA_INVALID_M64 */ >+ for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) >+ for (j = 0; j < MAX_M64_BAR; j++) >+ pdn->m64_map[i][j] = IODA_INVALID_M64; It would be done in pnv_pci_ioda_fixup_iov_resources(). That means it will be done for once if hotplug isn't considered. The code here will be called on every attempt to enable SRIOV capability, which isn't necessary, right? > >- if
Re: [PATCH v3 1/6] powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR
On Thu, Aug 13, 2015 at 10:11:06PM +0800, Wei Yang wrote: >On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If >a SRIOV device's IOV BAR is not 64bit-prefetchable, this is not assigned >from 64bit prefetchable window, which means M64 BAR can't work on it. > >This patch makes this explicit. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 25 + > 1 file changed, 9 insertions(+), 16 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 5738d31..9ac324e 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, >int offset) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > /* >* The actual IOV BAR range is determined by the start address >* and the actual size for num_vfs VFs BAR. 
This check is to >@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, >int offset) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); > res2 = *res; > res->start += size * offset; >@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > for (j = 0; j < vf_groups; j++) { > do { > win = > find_next_zero_bit(&phb->ioda.m64_bar_alloc, >@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > pdn = pci_get_pdn(pdev); > > if (phb->type == PNV_PHB_IODA2) { >+ if (!pdn->vfs_expanded) { >+ dev_info(&pdev->dev, "don't support this SRIOV device" >+ " with non 64bit-prefetchable IOV BAR\n"); >+ return -ENOSPC; >+ } >+ > /* Calculate available PE for required VFs */ > mutex_lock(&phb->ioda.pe_alloc_mutex); > pdn->offset = bitmap_find_next_zero_area( >@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > if (!res->flags || res->parent) > continue; > if (!pnv_pci_is_mem_pref_64(res->flags)) { >- dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", >+ dev_warn(&pdev->dev, "Don't support SR-IOV with" >+ " non M64 VF BAR%d: %pR. 
\n", >i, res); >- continue; >+ return; > } > > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > res = &pdev->resource[i + PCI_IOV_RESOURCES]; > if (!res->flags || res->parent) > continue; >- if (!pnv_pci_is_mem_pref_64(res->flags)) { >- dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: >%pR\n", >- i, res); >- continue; >- } > > dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
Hi Christoph, On Fri, Aug 14, 2015 at 12:35 AM, Christoph Hellwig wrote: > On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote: >> I.e. ~90% of this patch set seems to be just mechanically dropping >> BUG_ON()s and converting open coded stuff to use accessor functions >> (which should be macros or get inlined, right?) - and the remaining >> bit is not flushing if we don't have a physical page somewhere. > > Which is was 90%. By lines changed most actually is the diffs for > the cache flushing. I was talking in terms of changes made, not lines changed: by my recollection, about a third of the patches didn't touch flush calls and most of the lines changed looked like refactoring so that making the flush call conditional would be easier. I guess it smelled like you were doing lots of distinct changes in a single patch and I got my numbers wrong. >> Would it make sense to split this patch set into a few bits: one to >> drop all the useless BUG_ON()s, one to convert all the open coded >> stuff to accessor functions, then another to do the actual page-less >> sg stuff? > > Without the ifs the BUG_ON() actually are useful to assert we > never feed the sort of physical addresses we can't otherwise support, > so I don't think that part is doable. My point is that there's a couple of patches that only remove BUG_ON()s, which implies that for that particular driver it doesn't matter if there's a physical page or not, so therefore that code is purely "documentation". Thanks, -- Julian Calaby Email: julian.cal...@gmail.com Profile: http://www.google.com/profiles/julian.calaby/ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 7/7] pmem, dax: have direct_access use __pmem annotation
On Thu, Aug 13, 2015 at 9:51 AM, Ross Zwisler wrote: > Update the annotation for the kaddr pointer returned by direct_access() > so that it is a __pmem pointer. This is consistent with the PMEM driver > and with how this direct_access() pointer is used in the DAX code. > > Signed-off-by: Ross Zwisler > --- > Documentation/filesystems/Locking | 3 ++- > arch/powerpc/sysdev/axonram.c | 7 --- > drivers/block/brd.c | 4 ++-- > drivers/nvdimm/pmem.c | 4 ++-- > drivers/s390/block/dcssblk.c | 10 + > fs/block_dev.c| 2 +- > fs/dax.c | 44 > +-- > include/linux/blkdev.h| 8 +++ > 8 files changed, 45 insertions(+), 37 deletions(-) > > diff --git a/Documentation/filesystems/Locking > b/Documentation/filesystems/Locking > index 6a34a0f..06d4434 100644 > --- a/Documentation/filesystems/Locking > +++ b/Documentation/filesystems/Locking > @@ -397,7 +397,8 @@ prototypes: > int (*release) (struct gendisk *, fmode_t); > int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned > long); > int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, > unsigned long); > - int (*direct_access) (struct block_device *, sector_t, void **, > unsigned long *); > + int (*direct_access) (struct block_device *, sector_t, void __pmem **, > + unsigned long *); So this collides with the __pfn_t work. I think the we have a reasonable chance of getting that in to 4.3, so I'd wait to see if we hit any major roadblocks with that set [1] before merging these. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-August/001803.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 7/7] pmem, dax: have direct_access use __pmem annotation
Update the annotation for the kaddr pointer returned by direct_access() so that it is a __pmem pointer. This is consistent with the PMEM driver and with how this direct_access() pointer is used in the DAX code. Signed-off-by: Ross Zwisler --- Documentation/filesystems/Locking | 3 ++- arch/powerpc/sysdev/axonram.c | 7 --- drivers/block/brd.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- drivers/s390/block/dcssblk.c | 10 + fs/block_dev.c| 2 +- fs/dax.c | 44 +-- include/linux/blkdev.h| 8 +++ 8 files changed, 45 insertions(+), 37 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 6a34a0f..06d4434 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -397,7 +397,8 @@ prototypes: int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + int (*direct_access) (struct block_device *, sector_t, void __pmem **, + unsigned long *); int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index ee90db1..a2be2a6 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; + void *addr = (void *)(bank->ph_addr + offset); - *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; + *kaddr = (void 
__pmem *)addr; + *pfn = virt_to_phys(addr) >> PAGE_SHIFT; return bank->size - offset; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 5750b39..2691bb6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -381,7 +381,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; - *kaddr = page_address(page); + *kaddr = (void __pmem *)page_address(page); *pfn = page_to_pfn(page); /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ade9eb9..68f6a6a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct pmem_device *pmem = bdev->bd_disk->private_data; size_t offset = sector << 9; @@ -101,7 +101,7 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector, return -ENODEV; /* FIXME convert DAX to comprehend that this mapping has a lifetime */ - *kaddr = (void __force *) pmem->virt_addr + offset; + *kaddr = pmem->virt_addr + offset; *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; return pmem->size - offset; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index da21281..2c5a397 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct 
gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, -void **kaddr, unsigned long *pfn, long size); +void __pmem **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PAR
[PATCH v2 0/7] dax: I/O path enhancements
The goal of this series is to enhance the DAX I/O path so that all operations that store data (I/O writes, zeroing blocks, punching holes, etc.) properly synchronize the stores to media using the PMEM API. This ensures that the data DAX is writing is durable on media before the operation completes. Patches 1-4 are a few random cleanups. Changes from v1: - Removed patches to PMEM for the "read flush" _DSM flag. These are different enough that they deserve their own series, and they have a separate baseline which is currently moving (Dan's memremap() series). - Added clear_pmem() PMEM API to zero DAX memory and flush it in one call. (Dave) - Open coded flushing in arch_wb_cache_pmem() instead of adding a generic clwb_flush_range(). This allowed me to avoid having extra memory barriers and instead rely completely on arch_wmb_pmem() for ordering. (Dave) - Moved the arch implementation of the PMEM API into it's own arch header (Christoph). Ross Zwisler (7): brd: make rd_size static pmem, x86: move x86 PMEM API to new pmem.h header pmem: remove layer when calling arch_has_wmb_pmem() pmem, x86: clean up conditional pmem includes pmem: add wb_cache_pmem() and clear_pmem() dax: update I/O path to do proper PMEM flushing pmem, dax: have direct_access use __pmem annotation Documentation/filesystems/Locking | 3 +- MAINTAINERS | 1 + arch/powerpc/sysdev/axonram.c | 7 ++- arch/x86/include/asm/cacheflush.h | 71 -- arch/x86/include/asm/pmem.h | 123 ++ drivers/block/brd.c | 6 +- drivers/nvdimm/pmem.c | 4 +- drivers/s390/block/dcssblk.c | 10 ++-- fs/block_dev.c| 2 +- fs/dax.c | 73 ++ include/linux/blkdev.h| 8 +-- include/linux/pmem.h | 66 12 files changed, 247 insertions(+), 127 deletions(-) create mode 100644 arch/x86/include/asm/pmem.h -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel
Hi, Here is another instruction trace from a kernel context switch trace. Quite a lot of register and CR save/restore code. Regards, Anton c02943d8 mfcrr12 c02943dc std r20,-96(r1) c02943e0 std r21,-88(r1) c02943e4 rldicl. r9,r4,63,63 c02943e8 std r22,-80(r1) c02943ec mflrr0 c02943f0 std r24,-64(r1) c02943f4 std r25,-56(r1) c02943f8 std r26,-48(r1) c02943fc std r27,-40(r1) c0294400 std r31,-8(r1) c0294404 std r15,-136(r1) c0294408 stw r12,8(r1) c029440c std r16,-128(r1) c0294410 mcrfcr4,cr0 c0294414 std r0,16(r1) c0294418 std r17,-120(r1) c029441c std r18,-112(r1) c0294420 std r19,-104(r1) c0294424 std r23,-72(r1) c0294428 std r28,-32(r1) c029442c std r29,-24(r1) c0294430 std r30,-16(r1) c0294434 stdur1,-272(r1) c0294438 cmpwi cr7,r6,1 c029443c rlwinm r31,r4,4,1,31 c0294440 li r9,0 c029 rotlwi r31,r31,28 c0294448 mr r24,r6 c029444c mr r26,r4 c0294450 mr r25,r3 c0294454 mr r22,r5 c0294458 mr r21,r7 c029445c mr r20,r8 c0294460 std r9,120(r1) c0294464 std r9,112(r1) c0294468 clrldi r27,r31,32 c029446c beq cr7,c0294888 c0294888 ld r29,0(r5) c029488c addir29,r29,-32 c0294890 beq c0294478 c0294478 lwz r9,516(r25) c029447c and r10,r9,r31 c0294480 cmpwi r10,0 c0294484 bne c02945d0 c0294488 cmpdi cr7,r29,0 c029448c beq cr7,c02948c4 c0294490 lwz r9,264(r29) c0294494 and r10,r9,r31 c0294498 cmpwi r10,0 c029449c beq c02948c4 c02948c4 li r3,0 c02948c8 b c02947cc c02947cc addir1,r1,272 c02947d0 ld r0,16(r1) c02947d4 lwz r12,8(r1) c02947d8 ld r15,-136(r1) c02947dc ld r16,-128(r1) c02947e0 mtlrr0 c02947e4 ld r17,-120(r1) c02947e8 ld r18,-112(r1) c02947ec mtocrf 32,r12 c02947f0 mtocrf 16,r12 c02947f4 mtocrf 8,r12 c02947f8 ld r19,-104(r1) c02947fc ld r20,-96(r1) c0294800 ld r21,-88(r1) c0294804 ld r22,-80(r1) c0294808 ld r23,-72(r1) c029480c ld r24,-64(r1) c0294810 ld r25,-56(r1) c0294814 ld r26,-48(r1) c0294818 ld r27,-40(r1) c029481c ld r28,-32(r1) c0294820 ld r29,-24(r1) c0294824 ld r30,-16(r1) c0294828 ld r31,-8(r1) c029482c blr ___ Linuxppc-dev mailing list 
Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 09/10] Define PERF_PMU_TXN_READ interface
On Thu, Aug 13, 2015 at 01:04:28PM -0700, Sukadev Bhattiprolu wrote: > | > | +static int perf_read_group(struct perf_event *event, > | > | + u64 read_format, char __user *buf) > | > | +{ > | > | + struct perf_event *leader = event->group_leader, *child; > | > | + struct perf_event_context *ctx = leader->ctx; > | > | + int ret = leader->read_size; > One other question, We return leader->read_size but allocate/copy_to_user > the sibling's event->read_size. We consistently use read_format from the > 'event' being read, rather than its 'group_leader', so we are ok in terms > of what we copy into values[] for each event in the group. > > But, can the leader's read_format (and hence its read_size) differ from > its sibling's read_size? If so, in the current code, we return the event's > read_size but in the new code, we return the leader's read_size. Hmm, good spotting that. I'm fairly sure I didn't do that on purpose. I think we should use event->read_size there too and have the lot consistent. I don't think we require read_format to be uniform across siblings. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 09/10] Define PERF_PMU_TXN_READ interface
Peter Zijlstra [pet...@infradead.org] wrote: | On Tue, Aug 11, 2015 at 09:14:00PM -0700, Sukadev Bhattiprolu wrote: | > | +static void __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) | > | { | > | + struct perf_event *sub; | > | + int n = 1; /* skip @nr */ | > | > This n = 1 is to skip over the values[0] = 1 + nr_siblings in the | > caller. | > | > Anyway, in __perf_read_group_add() we always start with n = 1, however | > ... | > | | > | + perf_event_read(leader, true); | > | + | > | + /* | > | + * Since we co-schedule groups, {enabled,running} times of siblings | > | + * will be identical to those of the leader, so we only publish one | > | + * set. | > | + */ | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | > | + values[n++] += leader->total_time_enabled + | > | + atomic64_read(leader->child_total_time_enabled); | | Note how this is an in-place addition, Ah, yes, Sorry I missed that. It make sense now and my tests seem to be running fine. | | > | + } | > | | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | > | + values[n++] += leader->total_time_running + | > | + atomic64_read(leader->child_total_time_running); | | and here, | | > | + } | > | | > | + /* | > | + * Write {count,id} tuples for every sibling. | > | + */ | > | + values[n++] += perf_event_count(leader); | | and here, | | | > | if (read_format & PERF_FORMAT_ID) | > | values[n++] = primary_event_id(leader); | | and this will always assign the same value. 
| | > | + list_for_each_entry(sub, &leader->sibling_list, group_entry) { | > | + values[n++] += perf_event_count(sub); | > | + if (read_format & PERF_FORMAT_ID) | > | + values[n++] = primary_event_id(sub); | | Same for these, therefore, | | > | + } | > | +} | > | | > | +static int perf_read_group(struct perf_event *event, | > | +u64 read_format, char __user *buf) | > | +{ | > | + struct perf_event *leader = event->group_leader, *child; | > | + struct perf_event_context *ctx = leader->ctx; | > | + int ret = leader->read_size; One other question, We return leader->read_size but allocate/copy_to_user the sibling's event->read_size. We consistently use read_format from the 'event' being read, rather than its 'group_leader', so we are ok in terms of what we copy into values[] for each event in the group. But, can the leader's read_format (and hence its read_size) differ from its sibling's read_size? If so, in the current code, we return the event's read_size but in the new code, we return the leader's read_size. | > | + u64 *values; | > | | > | + lockdep_assert_held(&ctx->mutex); | > | | > | + values = kzalloc(event->read_size); | > | + if (!values) | > | + return -ENOMEM; | > | | > | + values[0] = 1 + leader->nr_siblings; | > | | > | + /* | > | + * By locking the child_mutex of the leader we effectively | > | + * lock the child list of all siblings.. XXX explain how. | > | + */ | > | + mutex_lock(&leader->child_mutex); | > | | > | + __perf_read_group_add(leader, read_format, values); | > | > ... we don't copy_to_user() here, | > | > | + list_for_each_entry(child, &leader->child_list, child_list) | > | + __perf_read_group_add(child, read_format, values); | > | > so won't we overwrite the values[], if we always start at n = 1 | > in __perf_read_group_add()? | | yes and no, we have to re-iterate the same values for each child as they | all have the same group, but we add the time and count fields, we do not | overwrite. 
The _add() suffix was supposed to be a hint ;-) | | > | + mutex_unlock(&leader->child_mutex); | > | + | > | + if (copy_to_user(buf, values, event->read_size)) | > | + ret = -EFAULT; | > | + | > | + kfree(values); | > | | > | return ret; | > | } | | Where previously we would iterate the group and for each member | iterate/sum all the child values together before copying the value out, | we now, because we need to read groups together, need to first iterate | the child list and sum whole groups. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes
On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote: > It makes no sense to put the instructions for calculating the lock > value (cpu number + 1) and the clearing of eq bit of cr1 in lbarx/stbcx > loop. And when the lock is acquired by the other thread, the current > lock value has no chance to equal with the lock value used by current > cpu. So we can skip the comparing for these two lock values in the > lbz/bne loop. > > Signed-off-by: Kevin Hao > --- > arch/powerpc/mm/tlb_low_64e.S | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S > index 765b419883f2..e4185581c5a7 100644 > --- a/arch/powerpc/mm/tlb_low_64e.S > +++ b/arch/powerpc/mm/tlb_low_64e.S > @@ -308,11 +308,11 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ >* >* MAS6:IND should be already set based on MAS4 >*/ > -1: lbarx r15,0,r11 > lhz r10,PACAPACAINDEX(r13) > - cmpdi r15,0 > - cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ > addir10,r10,1 > + crclr cr1*4+eq/* set cr1.eq = 0 for non-recursive */ > +1: lbarx r15,0,r11 > + cmpdi r15,0 > bne 2f You're optimizing the contended case at the expense of introducing stalls in the uncontended case. Does it really matter if there are more instructions in the loop? This change just means that you'll spin in the loop for more iterations (if it even does that -- I think the cycles per loop iteration might be the same before and after, due to load latency and pairing) while waiting for the other thread to release the lock. Do you have any benchmark results for this patch? -Scott ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 5/5] dma-mapping: consolidate dma_set_mask
On Thu, Aug 13, 2015 at 04:25:05PM +0100, Russell King - ARM Linux wrote: > On Thu, Aug 13, 2015 at 05:04:08PM +0200, Christoph Hellwig wrote: > > diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c > > index 1143c4d..260f52a 100644 > > --- a/arch/arm/common/dmabounce.c > > +++ b/arch/arm/common/dmabounce.c > > @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device > > *dev, > > arm_dma_ops.sync_single_for_device(dev, handle, size, dir); > > } > > > > -static int dmabounce_set_mask(struct device *dev, u64 dma_mask) > > -{ > > - if (dev->archdata.dmabounce) > > - return 0; > > - > > - return arm_dma_ops.set_dma_mask(dev, dma_mask); > > Are you sure about this? A user of dmabounce gets to request any mask > with the original code (even though it was never written back... which > is a separate bug.) After this, it seems that this will get limited > by the dma_supported() check. As this old code is about bouncing any > buffer into DMA-able memory, it doesn't care about the DMA mask. I think you're right. With the default dma_supported implementation it would be fine, but ARM uses a custom one. I'll keep the arm specific dma_set_mask implementation for the next round. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/5] dma-mapping: consolidate dma_{alloc,free}_noncoherent
On Thu, Aug 13, 2015 at 04:20:40PM +0100, Russell King - ARM Linux wrote: > > -/* > > - * Dummy noncoherent implementation. We don't provide a dma_cache_sync > > - * function so drivers using this API are highlighted with build warnings. > > - */ > > I'd like a similar comment to remain after this patch explaining that we > don't support non-coherent allocations and that it'll be highlighted by > the lack of dma_cache_sync, otherwise I'm sure we'll start to get patches > to add the thing. I'll keep a modified version of this comment in the ARM dma-mapping.h in addition to an explanation near the new common dma_alloc_noncoherent definition, thanks! ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
On 08/13/2015 05:40 PM, Christoph Hellwig wrote: > On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote: >> The support I have suggested and submitted for zone-less sections. >> (In my add_persistent_memory() patchset) >> >> Would work perfectly well and transparent for all such multimedia cases. >> (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM >> a few times and it is great easy fun. (I wanted to experiment with cached >> memory over a pcie) > > And everyone agree that it was both buggy and incomplete. > What? No one ever said anything about bugs. Is the first ever I hear of it. I was always in the notion that no one even tried it out. I'm smoking these page-full nvdimms for more than a year. With RDMA to peers and swap out to disks. So is not that bad I would say > Dan has done a respin of the page backed nvdimm work with most of > these comments addressed. > I would love some comments. All I got so far is silence. (And I do not like Dan's patches comments will come next week) > I have to say I hate both pfn-based I/O [1] and page backed nvdimms with > passion, so we're looking into the lesser evil with an open mind. > > [1] not the SGL part posted here, which I think is quite sane. The bio > side is much worse, though. > What can I say. I like the page-backed nvdimms. And the long term for me is 2M pages. I hope we can sit one day soon and you explain to me what's evil about it. I would really really like to understand Thanks though Boaz ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 5/5] dma-mapping: consolidate dma_set_mask
On Thu, Aug 13, 2015 at 05:04:08PM +0200, Christoph Hellwig wrote: > diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c > index 1143c4d..260f52a 100644 > --- a/arch/arm/common/dmabounce.c > +++ b/arch/arm/common/dmabounce.c > @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device *dev, > arm_dma_ops.sync_single_for_device(dev, handle, size, dir); > } > > -static int dmabounce_set_mask(struct device *dev, u64 dma_mask) > -{ > - if (dev->archdata.dmabounce) > - return 0; > - > - return arm_dma_ops.set_dma_mask(dev, dma_mask); Are you sure about this? A user of dmabounce gets to request any mask with the original code (even though it was never written back... which is a separate bug.) After this, it seems that this will get limited by the dma_supported() check. As this old code is about bouncing any buffer into DMA-able memory, it doesn't care about the DMA mask. -- FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up according to speedtest.net. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/5] dma-mapping: consolidate dma_{alloc, free}_noncoherent
On Thu, Aug 13, 2015 at 05:04:05PM +0200, Christoph Hellwig wrote: > diff --git a/arch/arm/include/asm/dma-mapping.h > b/arch/arm/include/asm/dma-mapping.h > index 2ae3424..ab521d5 100644 > --- a/arch/arm/include/asm/dma-mapping.h > +++ b/arch/arm/include/asm/dma-mapping.h > @@ -175,21 +175,6 @@ static inline int dma_mapping_error(struct device *dev, > dma_addr_t dma_addr) > return dma_addr == DMA_ERROR_CODE; > } > > -/* > - * Dummy noncoherent implementation. We don't provide a dma_cache_sync > - * function so drivers using this API are highlighted with build warnings. > - */ I'd like a similar comment to remain after this patch explaining that we don't support non-coherent allocations and that it'll be highlighted by the lack of dma_cache_sync, otherwise I'm sure we'll start to get patches to add the thing. -- FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up according to speedtest.net. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/5] dma-mapping: consolidate dma_set_mask
Almost everyone implements dma_set_mask the same way, although some time that's hidden in ->set_dma_mask methods. Move this implementation to common code, including a callout to override the post-check action, and remove duplicate instaces in methods as well. Unfortunately some architectures overload unrelated semantics like changing the dma_ops into it so we still need to allow for an architecture override for now. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/alpha/kernel/pci-noop.c | 10 -- arch/alpha/kernel/pci_iommu.c | 11 --- arch/arm/common/dmabounce.c | 9 - arch/arm/include/asm/dma-mapping.h| 5 - arch/arm/mm/dma-mapping.c | 16 arch/arm/xen/mm.c | 1 - arch/arm64/include/asm/dma-mapping.h | 9 - arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 1 - arch/hexagon/kernel/dma.c | 11 --- arch/ia64/include/asm/dma-mapping.h | 9 - arch/microblaze/include/asm/dma-mapping.h | 14 -- arch/mips/include/asm/dma-mapping.h | 16 arch/openrisc/include/asm/dma-mapping.h | 9 - arch/powerpc/include/asm/dma-mapping.h| 4 +++- arch/powerpc/platforms/cell/iommu.c | 3 --- arch/s390/include/asm/dma-mapping.h | 2 -- arch/s390/pci/pci_dma.c | 10 -- arch/sh/include/asm/dma-mapping.h | 14 -- arch/sparc/include/asm/dma-mapping.h | 5 +++-- arch/tile/include/asm/dma-mapping.h | 5 +++-- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 2 -- arch/x86/kernel/pci-dma.c | 11 --- drivers/xen/swiotlb-xen.c | 12 include/asm-generic/dma-mapping-common.h | 16 include/xen/swiotlb-xen.h | 2 -- 28 files changed, 25 insertions(+), 202 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 9d763e5..72a8ca7 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_set_mask(struct device *dev, u64 mask) 
-{ - return get_dma_ops(dev)->set_dma_mask(dev, mask); -} - #define dma_cache_sync(dev, va, size, dir) ((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index df24b76..2b1f4a1 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask) return mask < 0x00ffUL ? 0 : 1; } -static int alpha_noop_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - return 0; -} - struct dma_map_ops alpha_noop_ops = { .alloc = alpha_noop_alloc_coherent, .free = alpha_noop_free_coherent, @@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = { .map_sg = alpha_noop_map_sg, .mapping_error = alpha_noop_mapping_error, .dma_supported = alpha_noop_supported, - .set_dma_mask = alpha_noop_set_mask, }; struct dma_map_ops *dma_ops = &alpha_noop_ops; diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index eddee77..8969bf2 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr) return dma_addr == 0; } -static int alpha_pci_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || - !pci_dma_supported(alpha_gendev_to_pci(dev), mask)) - return -EIO; - - *dev->dma_mask = mask; - return 0; -} - struct dma_map_ops alpha_pci_ops = { .alloc = alpha_pci_alloc_coherent, .free = alpha_pci_free_coherent, @@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = { .unmap_sg = alpha_pci_unmap_sg, .mapping_error = alpha_pci_mapping_error, .dma_supported = alpha_pci_supported, - .set_dma_mask = alpha_pci_set_mask, }; struct dma_map_ops *dma_ops = &alpha_pci_ops; diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c index 1143c4d..260f52a 100644 --- a/arch/arm/common/dmabounce.c +++ 
b/arch/arm/common/dmabounce.c @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device *dev, arm_dma_ops.sync_single_for_device(de
[PATCH 3/5] dma-mapping: cosolidate dma_mapping_error
Currently there are three valid implementations of dma_mapping_error: (1) call ->mapping_error (2) check for a hardcoded error code (3) always return 0 This patch provides a common implementation that calls ->mapping_error if present, then checks for DMA_ERROR_CODE if defined or otherwise returns 0. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/arm/include/asm/dma-mapping.h| 9 - arch/arm64/include/asm/dma-mapping.h | 7 --- arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 11 +-- arch/ia64/include/asm/dma-mapping.h | 7 --- arch/microblaze/include/asm/dma-mapping.h | 11 --- arch/mips/include/asm/dma-mapping.h | 8 arch/openrisc/include/asm/dma-mapping.h | 5 - arch/powerpc/include/asm/dma-mapping.h| 17 ++--- arch/s390/include/asm/dma-mapping.h | 10 -- arch/sh/include/asm/dma-mapping.h | 13 ++--- arch/sparc/include/asm/dma-mapping.h | 6 -- arch/tile/include/asm/dma-mapping.h | 7 --- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 11 --- include/asm-generic/dma-mapping-common.h | 14 ++ 17 files changed, 19 insertions(+), 137 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 0552bf0..80ac3e8 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return get_dma_ops(dev)->mapping_error(dev, dma_addr); -} - static inline int dma_supported(struct device *dev, u64 mask) { return get_dma_ops(dev)->dma_supported(dev, mask); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index ab521d5..2fa33d7 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -166,15 +166,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, 
size_t size) static inline void dma_mark_clean(void *addr, size_t size) { } -/* - * DMA errors are defined by all-bits-set in the DMA address. - */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - debug_dma_mapping_error(dev, dma_addr); - return dma_addr == DMA_ERROR_CODE; -} - extern int dma_supported(struct device *dev, u64 mask); extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 178e60b..f45f444 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -84,13 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) return (phys_addr_t)dev_addr; } -static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dev_addr); - return ops->mapping_error(dev, dev_addr); -} - static inline int dma_supported(struct device *dev, u64 mask) { struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 72465ce..5eef053 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -20,9 +20,4 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return 0; } -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - #endif diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index 58d2d8f..e661192 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -31,6 +31,7 @@ struct device; extern int bad_dma_address; +#define DMA_ERROR_CODE bad_dma_address extern struct dma_map_ops *dma_ops; @@ -57,14 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size - 1 <= *dev->dma_mask; } -static 
inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (dma_ops->mapping_error) - return dma_ops->mapping_error(dev, dma_addr); - - return (dma_addr == bad_dma_address); -} - #endif diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index a925ff0..27b713d 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -27,13 +27,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #include -static inline int dma_mapping_error(st
[PATCH 2/5] dma-mapping: consolidate dma_{alloc,free}_noncoherent
Most architectures do not support non-coherent allocations and either define dma_{alloc,free}_noncoherent to their coherent versions or stub them out. Openrisc uses dma_{alloc,free}_attrs to implement them, and only Mips implements them directly. This patch moves the Openrisc version to common code, and handles the DMA_ATTR_NON_CONSISTENT case in the mips dma_map_ops instance. Note that actual non-coherent allocations require a dma_cache_sync implementation, so if non-coherent allocations didn't work on an architecture before this patch they still won't work after it. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 3 --- arch/arm/include/asm/dma-mapping.h| 15 --- arch/arm64/include/asm/dma-mapping.h | 14 -- arch/h8300/include/asm/dma-mapping.h | 3 --- arch/hexagon/include/asm/dma-mapping.h| 3 --- arch/ia64/include/asm/dma-mapping.h | 3 --- arch/microblaze/include/asm/dma-mapping.h | 3 --- arch/mips/include/asm/dma-mapping.h | 6 -- arch/mips/mm/dma-default.c| 20 +++- arch/openrisc/include/asm/dma-mapping.h | 20 arch/powerpc/include/asm/dma-mapping.h| 3 --- arch/s390/include/asm/dma-mapping.h | 3 --- arch/sh/include/asm/dma-mapping.h | 3 --- arch/sparc/include/asm/dma-mapping.h | 3 --- arch/tile/include/asm/dma-mapping.h | 3 --- arch/unicore32/include/asm/dma-mapping.h | 3 --- arch/x86/include/asm/dma-mapping.h| 3 --- include/asm-generic/dma-mapping-common.h | 18 ++ 18 files changed, 33 insertions(+), 96 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 9fef5bd..0552bf0 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -27,9 +27,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return get_dma_ops(dev)->set_dma_mask(dev, mask); } -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - #define dma_cache_sync(dev, va, size, dir) 
((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 2ae3424..ab521d5 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -175,21 +175,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return dma_addr == DMA_ERROR_CODE; } -/* - * Dummy noncoherent implementation. We don't provide a dma_cache_sync - * function so drivers using this API are highlighted with build warnings. - */ -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) -{ - return NULL; -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) -{ -} - extern int dma_supported(struct device *dev, u64 mask); extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 5e11b3f..178e60b 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -118,19 +118,5 @@ static inline void dma_mark_clean(void *addr, size_t size) { } -/* - * There is no dma_cache_sync() implementation, so just return NULL here. 
- */ -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t flags) -{ - return NULL; -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) -{ -} - #endif /* __KERNEL__ */ #endif /* __ASM_DMA_MAPPING_H */ diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 826aa9b..72465ce 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -20,9 +20,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return 0; } -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return 0; diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index c20d3ca..58d2d8f 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -34,9 +34,6 @@ extern int bad_dma_address; extern struct dma_map_ops *dma_ops; -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - static inline s
[PATCH 1/5] dma-mapping: consolidate dma_{alloc, free}_{attrs, coherent}
The coherent DMA allocator works the same over all architectures supporting dma_map operations. This patch consolidates them and converges the minor differences: - the debug_dma helpers are now called from all architectures, including those that were previously missing them - dma_alloc_from_coherent and dma_release_from_coherent are now always called from the generic alloc/free routines instead of the ops dma-mapping-common.h always includes dma-coherent.h to get the definitions for them, or the stubs if the architecture doesn't support this feature - checks for ->alloc / ->free presence are removed. There is only one instance of dma_map_ops without them (mic_dma_ops) and that one is x86 only anyway. Besides that only x86 needs special treatment to replace a default device if none is passed and tweak the gfp_flags. An optional arch hook is provided for that. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 18 -- arch/arm/include/asm/dma-mapping.h| 29 arch/arm/mm/dma-mapping.c | 11 -- arch/arm64/include/asm/dma-mapping.h | 33 -- arch/h8300/include/asm/dma-mapping.h | 26 -- arch/hexagon/include/asm/dma-mapping.h| 33 -- arch/ia64/include/asm/dma-mapping.h | 25 - arch/microblaze/include/asm/dma-mapping.h | 31 - arch/mips/cavium-octeon/dma-octeon.c | 8 - arch/mips/include/asm/dma-mapping.h | 31 - arch/mips/loongson64/common/dma-swiotlb.c | 8 - arch/mips/mm/dma-default.c| 7 arch/mips/netlogic/common/nlm-dma.c | 8 - arch/openrisc/include/asm/dma-mapping.h | 30 arch/powerpc/include/asm/dma-mapping.h| 33 -- arch/s390/include/asm/dma-mapping.h | 31 - arch/sh/include/asm/dma-mapping.h | 37 arch/sparc/include/asm/dma-mapping.h | 26 -- arch/tile/include/asm/dma-mapping.h | 27 -- arch/unicore32/include/asm/dma-mapping.h | 24 - arch/x86/include/asm/dma-mapping.h| 16 ++--- arch/x86/kernel/pci-dma.c | 49 +- drivers/xen/swiotlb-xen.c | 6 include/asm-generic/dma-mapping-common.h | 58 +++ 24 files changed, 70 insertions(+), 535 deletions(-) diff --git 
a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index dfa32f0..9fef5bd 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,24 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -#define dma_alloc_coherent(d,s,h,f)dma_alloc_attrs(d,s,h,f,NULL) - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - struct dma_attrs *attrs) -{ - return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs); -} - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs); -} - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return get_dma_ops(dev)->mapping_error(dev, dma_addr); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index b52101d..2ae3424 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -209,21 +208,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs); -#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL) - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *cpu_addr; - BUG_ON(!ops); - - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); - return cpu_addr; -} - /** * arm_dma_free - free memory allocated by arm_dma_alloc * @dev: valid struct device pointer, or NULL for ISA and 
EISA-like devices @@ -241,19 +225,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, extern void arm_dma_free(struct device *dev, size_t size,
provide more common DMA API functions
Since 2009 we have a nice asm-generic header implementing lots of DMA API functions for architectures using struct dma_map_ops, but unfortunately it's still missing a lot of APIs that all architectures still have to duplicate. This series consolidates the remaining functions, although we still need arch opt outs for two of them as a few architectures have very non-standard implementations. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/5] dma-mapping: consolidate dma_supported
Most architectures just call into ->dma_supported, but some also return 1 if the method is not present, or 0 if no dma ops are present (although that should never happen). Consolidate this broader version into common code. Also fix h8300 which incorrectly always returned 0, which would have been a problem if its dma_set_mask implementation wasn't a similarly buggy noop. As a few architectures have much more elaborate implementations, we still allow for arch overrides. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/arm/include/asm/dma-mapping.h| 5 +++-- arch/arm64/include/asm/dma-mapping.h | 6 -- arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 1 + arch/ia64/include/asm/dma-mapping.h | 6 -- arch/microblaze/include/asm/dma-mapping.h | 11 --- arch/mips/include/asm/dma-mapping.h | 6 -- arch/openrisc/include/asm/dma-mapping.h | 5 +++-- arch/powerpc/include/asm/dma-mapping.h| 11 --- arch/s390/include/asm/dma-mapping.h | 9 - arch/sh/include/asm/dma-mapping.h | 10 -- arch/sparc/include/asm/dma-mapping.h | 1 + arch/tile/include/asm/dma-mapping.h | 6 -- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 4 +++- include/asm-generic/dma-mapping-common.h | 13 + 17 files changed, 24 insertions(+), 90 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 80ac3e8..9d763e5 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - return get_dma_ops(dev)->dma_supported(dev, mask); -} - static inline int dma_set_mask(struct device *dev, u64 mask) { return get_dma_ops(dev)->set_dma_mask(dev, mask); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 2fa33d7..b90d247 100644 --- 
a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -38,6 +38,9 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) dev->archdata.dma_ops = ops; } +#define HAVE_ARCH_DMA_SUPPORTED 1 +extern int dma_supported(struct device *dev, u64 mask); + #include static inline int dma_set_mask(struct device *dev, u64 mask) @@ -166,8 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline void dma_mark_clean(void *addr, size_t size) { } -extern int dma_supported(struct device *dev, u64 mask); - extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); /** diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index f45f444..f519a58 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -84,12 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) return (phys_addr_t)dev_addr; } -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - return ops->dma_supported(dev, mask); -} - static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 5eef053..48d652e 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -10,11 +10,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 0; -} - static inline int dma_set_mask(struct device *dev, u64 mask) { return 0; diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index e661192..36e8de7 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -43,6 +43,7 @@ static inline struct dma_map_ops *get_dma_ops(struct 
device *dev) return dma_ops; } +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *dev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle); diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 27b713d..7982caa 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -27,12 +27,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = platform_dma_get_ops(dev); - r
Re: RFC: prepare for struct scatterlist entries without page backing
On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote: > I.e. ~90% of this patch set seems to be just mechanically dropping > BUG_ON()s and converting open coded stuff to use accessor functions > (which should be macros or get inlined, right?) - and the remaining > bit is not flushing if we don't have a physical page somewhere. Which it was, 90%. By lines changed most actually is the diffs for the cache flushing. > Would it make sense to split this patch set into a few bits: one to > drop all the useless BUG_ON()s, one to convert all the open coded > stuff to accessor functions, then another to do the actual page-less > sg stuff? Without the ifs the BUG_ON()s actually are useful to assert we never feed the sort of physical addresses we can't otherwise support, so I don't think that part is doable. A simple series to make more use of sg_phys and add sg_pfn might still be useful, though. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 31/31] dma-mapping-common: skip kmemleak checks for page-less SG entries
On Wed, Aug 12, 2015 at 09:05:15AM -0700, Linus Torvalds wrote: > [ Again, I'm responding to one random patch - this pattern was in > other patches too. ] > > A question: do we actually expect to mix page-less and pageful SG > entries in the same SG list? > > How does that happen? Both for DAX and the video buffer case people could do direct I/O spanning the boundary between such a VMA and a normal one unless we add special code to prevent that. Right now I don't think it's all that useful, but then again it doesn't seem harmful either and adding those checks might add up. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: > I'm assuming that anybody who wants to use the page-less > scatter-gather lists always does so on memory that isn't actually > virtually mapped at all, or only does so on sane architectures that > are cache coherent at a physical level, but I'd like that assumption > *documented* somewhere. It's temporarily mapped by kmap-like helpers. That code isn't in this series. The most recent version of it is here: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 note that it's not doing the cache flushing it would have to do yet, but it's also only enabled for x86 at the moment. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote: > The support I have suggested and submitted for zone-less sections. > (In my add_persistent_memory() patchset) > > Would work perfectly well and transparent for all such multimedia cases. > (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM > a few times and it is great easy fun. (I wanted to experiment with cached > memory over a pcie) And everyone agree that it was both buggy and incomplete. Dan has done a respin of the page backed nvdimm work with most of these comments addressed. I have to say I hate both pfn-based I/O [1] and page backed nvdimms with passion, so we're looking into the lesser evil with an open mind. [1] not the SGL part posted here, which I think is quite sane. The bio side is much worse, though. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
When M64 BAR is set to Single PE mode, the PE# assigned to VF could be sparse. This patch restructures the patch to allocate sparse PE# for VFs when M64 BAR is set to Single PE mode. Signed-off-by: Wei Yang --- arch/powerpc/include/asm/pci-bridge.h |2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 59 +++-- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 9d33ada..b026ef8 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -214,7 +214,7 @@ struct pci_dn { #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs;/* number of VFs enabled*/ - int offset; /* PE# for the first VF PE */ + int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ boolm64_single_mode;/* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64(-1) int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1e6ac86..7633538 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) /* Map the M64 here */ if (pdn->m64_single_mode) { - pe_num = pdn->offset + j; + pe_num = pdn->pe_num_map[j]; rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe_num, OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); @@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) struct pnv_phb*phb; struct pci_dn *pdn; struct pci_sriov *iov; - u16 num_vfs; + u16 num_vfs, i; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) if (phb->type == PNV_PHB_IODA2) { if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -pdn->offset); + pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); /* Release M64 windows */ pnv_pci_vf_release_m64(pdev); /* 
Release PE numbers */ - bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); - pdn->offset = 0; + if (pdn->m64_single_mode) { + for (i = 0; i < num_vfs; i++) + pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + } else + bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], num_vfs); } } @@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) /* Reserve PE for each VF */ for (vf_index = 0; vf_index < num_vfs; vf_index++) { - pe_num = pdn->offset + vf_index; + if (pdn->m64_single_mode) + pe_num = pdn->pe_num_map[vf_index]; + else + pe_num = pdn->pe_num_map[0] + vf_index; pe = &phb->ioda.pe_array[pe_num]; pe->pe_number = pe_num; @@ -1425,6 +1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) struct pnv_phb*phb; struct pci_dn *pdn; intret; + u16i; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) } /* Calculate available PE for required VFs */ - mutex_lock(&phb->ioda.pe_alloc_mutex); - pdn->offset = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe, - 0, num_vfs, 0); - if (pdn->offset >= phb->ioda.total_pe) { + if (pdn->m64_single_mode) { + for (i = 0; i < num_vfs; i++) + pdn->pe_num_map[i] = IODA_INVALID_PE; + for (i = 0; i < num_vfs; i++) { + pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) { + ret = -EBUSY; + goto m64_failed; + } + } + } else { + mutex_lock(&phb->ioda.pe_alloc_mutex); + pdn->pe_num_map[0] = bitmap_find_next_zero_area( +
[PATCH v3 4/6] powerpc/powernv: replace the hard coded boundary with gate
At the moment 64bit-prefetchable window can be maximum 64GB, which is currently got from device tree. This means that in shared mode the maximum supported VF BAR size is 64GB/256=256MB. While this size could exhaust the whole 64bit-prefetchable window. This is a design decision to set a boundary to 64MB of the VF BAR size. Since VF BAR size with 64MB would occupy a quarter of the 64bit-prefetchable window, this is affordable. This patch replaces magic limit of 64MB with (m64_segsize >> 1) and adds comment to explain the reason for it. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4da0f50..3e8c0b4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pnv_phb *phb; struct resource *res; int i; - resource_size_t size; + resource_size_t size, gate; struct pci_dn *pdn; int mul, total_vfs; @@ -2704,6 +2704,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) total_vfs = pci_sriov_get_totalvfs(pdev); mul = phb->ioda.total_pe; + /* +* If bigger than or equal to half of M64 segment size, just round up +* power of two. +* +* Generally, one M64 BAR maps one IOV BAR. To avoid conflict with +* other devices, IOV BAR size is expanded to be (total_pe * +* VF_BAR_size). When VF_BAR_size is half of M64 segment size , the +* expanded size would equal to half of the whole M64 Space size, +* which will exhaust the M64 Space and limit the system flexibility. 
+*/ + gate = phb->ioda.m64_segsize >> 1; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; @@ -2718,10 +2729,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* bigger than 64M */ - if (size > (1 << 26)) { - dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", -i, res); + /* bigger than or equal to gate */ + if (size >= gate) { + dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " + "is bigger than %lld, roundup power2\n", +i, res, gate); mul = roundup_pow_of_two(total_vfs); pdn->m64_single_mode = true; break; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 5/6] powerpc/powernv: boundary the total VF BAR size instead of the individual one
Each VF could have 6 BARs at most. When the total BAR size exceeds the gate, after expanding it will also exhaust the M64 Window. This patch limits the boundary by checking the total VF BAR size instead of the individual BAR. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3e8c0b4..1e6ac86 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pnv_phb *phb; struct resource *res; int i; - resource_size_t size, gate; + resource_size_t size, gate, total_vf_bar_sz; struct pci_dn *pdn; int mul, total_vfs; @@ -2715,6 +2715,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) * which will exhaust the M64 Space and limit the system flexibility. */ gate = phb->ioda.m64_segsize >> 1; + total_vf_bar_sz = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; @@ -2727,13 +2728,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) return; } - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + total_vf_bar_sz += pci_iov_resource_size(pdev, + i + PCI_IOV_RESOURCES); /* bigger than or equal to gate */ - if (size >= gate) { - dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " - "is bigger than %lld, roundup power2\n", -i, res, gate); + if (total_vf_bar_sz >= gate) { + dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size " + "is bigger than %lld, roundup power2\n", gate); mul = roundup_pow_of_two(total_vfs); pdn->m64_single_mode = true; break; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 BARs in Single PE mode to cover the number of VFs required to be enabled. By doing so, several VFs would be in one VF Group and leads to interference between VFs in the same group. This patch changes the design by using one M64 BAR in Single PE mode for one VF BAR. This gives absolute isolation for VFs. Signed-off-by: Wei Yang --- arch/powerpc/include/asm/pci-bridge.h |6 +- arch/powerpc/platforms/powernv/pci-ioda.c | 163 +++-- 2 files changed, 62 insertions(+), 107 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 712add5..9d33ada 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) */ struct iommu_table; +#define MAX_M64_BAR 16 struct pci_dn { int flags; #define PCI_DN_FLAG_IOV_VF 0x01 @@ -214,10 +215,9 @@ struct pci_dn { u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs;/* number of VFs enabled*/ int offset; /* PE# for the first VF PE */ -#define M64_PER_IOV 4 - int m64_per_iov; + boolm64_single_mode;/* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64(-1) - int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; + int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; #endif /* CONFIG_PCI_IOV */ #endif struct list_head child_list; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 67b8f72..4da0f50 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev) pdn = pci_get_pdn(pdev); for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) { - if (pdn->m64_wins[i][j] == IODA_INVALID_M64) + for (j = 0; j < MAX_M64_BAR; j++) { + if (pdn->m64_map[i][j] == IODA_INVALID_M64) continue; 
opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); - clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); - pdn->m64_wins[i][j] = IODA_INVALID_M64; + OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); + clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); + pdn->m64_map[i][j] = IODA_INVALID_M64; } - return 0; } @@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) inttotal_vfs; resource_size_tsize, start; intpe_num; - intvf_groups; - intvf_per_group; + intm64_bars; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) pdn = pci_get_pdn(pdev); total_vfs = pci_sriov_get_totalvfs(pdev); - /* Initialize the m64_wins to IODA_INVALID_M64 */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) - pdn->m64_wins[i][j] = IODA_INVALID_M64; + if (pdn->m64_single_mode) + m64_bars = num_vfs; + else + m64_bars = 1; + + /* Initialize the m64_map to IODA_INVALID_M64 */ + for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) + for (j = 0; j < MAX_M64_BAR; j++) + pdn->m64_map[i][j] = IODA_INVALID_M64; - if (pdn->m64_per_iov == M64_PER_IOV) { - vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; - vf_per_group = (num_vfs <= M64_PER_IOV)? 1: - roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; - } else { - vf_groups = 1; - vf_per_group = 1; - } for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || !res->parent) continue; - for (j = 0; j < vf_groups; j++) { + for (j = 0; j < m64_bars; j++) { do { win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1, 0); @@ -1224,12 +1219,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) goto m64_failed;
[PATCH v3 1/6] powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR
On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If a SRIOV device's IOV BAR is not 64bit-prefetchable, this is not assigned from 64bit prefetchable window, which means M64 BAR can't work on it. This patch makes this explicit. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 25 + 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5738d31..9ac324e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - /* * The actual IOV BAR range is determined by the start address * and the actual size for num_vfs VFs BAR. This check is to @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); res2 = *res; res->start += size * offset; @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - for (j = 0; j < vf_groups; j++) { do { win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) pdn = pci_get_pdn(pdev); if (phb->type == PNV_PHB_IODA2) { + if (!pdn->vfs_expanded) { + dev_info(&pdev->dev, "don't support this SRIOV device" + " with non 64bit-prefetchable IOV BAR\n"); + return -ENOSPC; + } + /* Calculate available PE for required VFs */ mutex_lock(&phb->ioda.pe_alloc_mutex); pdn->offset = bitmap_find_next_zero_area( @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) 
if (!res->flags || res->parent) continue; if (!pnv_pci_is_mem_pref_64(res->flags)) { - dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", + dev_warn(&pdev->dev, "Don't support SR-IOV with" + " non M64 VF BAR%d: %pR. \n", i, res); - continue; + return; } size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) { - dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", -i, res); - continue; - } dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
The alignment of IOV BAR on PowerNV platform is the total size of the IOV BAR. No matter whether the IOV BAR is extended with number of roundup_pow_of_two(total_vfs) or number of max PE number (256), the total size could be calculated by (vfs_expanded * VF_BAR_size). This patch simplifies the pnv_pci_iov_resource_alignment() by removing the first case. Signed-off-by: Wei Yang Reviewed-by: Gavin Shan --- arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9ac324e..67b8f72 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno) { struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align, iov_align; - - iov_align = resource_size(&pdev->resource[resno]); - if (iov_align) - return iov_align; + resource_size_t align; + /* +* On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the +* SR-IOV. While from hardware perspective, the range mapped by M64 +* BAR should be size aligned. +* +* This function return the total IOV BAR size if expanded or just the +* individual size if not. +*/ align = pci_iov_resource_size(pdev, resno); if (pdn->vfs_expanded) return pdn->vfs_expanded * align; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/6] Redesign SR-IOV on PowerNV
In the original design, it tries to group VFs to enable a larger number of VFs in the system, when VF BAR is bigger than 64MB. This design has a flaw in which one error on a VF will interfere with other VFs in the same group. This patch series changes this design by using M64 BAR in Single PE mode to cover only one VF BAR. By doing so, it gives absolute isolation between VFs. v3: * return -ENOSPC when a VF has non-64bit prefetchable BAR * rename offset to pe_num_map and define it statically * change commit log based on comments * define m64_map statically v2: * clean up iov bar alignment calculation * change m64s to m64_bars * add a field to represent M64 Single PE mode will be used * change m64_wins to m64_map * calculate the gate instead of hard coded * dynamically allocate m64_map * dynamically allocate PE# * add a case to calculate iov bar alignment when M64 Single PE is used * when M64 Single PE is used, compare num_vfs with M64 BAR available number in system at first Wei Yang (6): powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR powerpc/powernv: simplify the calculation of iov resource alignment powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR powerpc/powernv: replace the hard coded boundary with gate powerpc/powernv: boundary the total VF BAR size instead of the individual one powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode arch/powerpc/include/asm/pci-bridge.h |8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 284 ++--- 2 files changed, 139 insertions(+), 153 deletions(-) -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] powerpc/e6500: hw tablewalk: order the memory access when acquire/release tcd lock
I didn't find anything unusual. But I think we do need to order the load/store of esel_next when acquire/release tcd lock. For acquire, add a data dependency to order the loads of lock and esel_next. For release, even there already have a "isync" here, but it doesn't guarantee any memory access order. So we still need "lwsync" for the two stores for lock and esel_next. Signed-off-by: Kevin Hao --- arch/powerpc/mm/tlb_low_64e.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index e4185581c5a7..964754911987 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -334,6 +334,8 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * with tlbilx before overwriting. */ + andir15,r15,0 /* add a data dependency to order the loards */ + add r11,r11,r15 /* between the lock and esel_next */ lbz r15,TCD_ESEL_NEXT(r11) rlwinm r10,r15,16,0xff orisr10,r10,MAS0_TLBSEL(1)@h @@ -447,6 +449,7 @@ BEGIN_FTR_SECTION beq cr1,1f /* no unlock if lock was recursively grabbed */ li r15,0 isync + lwsync stb r15,0(r11) 1: END_FTR_SECTION_IFSET(CPU_FTR_SMT) -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes
It makes no sense to put the instructions for calculating the lock value (cpu number + 1) and the clearing of the eq bit of cr1 in the lbarx/stbcx loop. And when the lock is acquired by the other thread, the current lock value has no chance to be equal to the lock value used by the current cpu. So we can skip the comparison of these two lock values in the lbz/bne loop. Signed-off-by: Kevin Hao --- arch/powerpc/mm/tlb_low_64e.S | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 765b419883f2..e4185581c5a7 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -308,11 +308,11 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * * MAS6:IND should be already set based on MAS4 */ -1: lbarx r15,0,r11 lhz r10,PACAPACAINDEX(r13) - cmpdi r15,0 - cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ addir10,r10,1 + crclr cr1*4+eq/* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 bne 2f stbcx. r10,0,r11 bne 1b @@ -320,9 +320,9 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ .subsection 1 2: cmpdcr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ beq cr1,3b /* unlock will happen if cr1.eq = 0 */ - lbz r15,0(r11) +10:lbz r15,0(r11) cmpdi r15,0 - bne 2b + bne 10b b 1b .previous -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/3] powerpc/e6500: remove the stale TCD_LOCK macro
Since we moved the "lock" to be the first element of struct tlb_core_data in commit 82d86de25b9c ("powerpc/e6500: Make TLB lock recursive"), this macro is not used by any code. Just delete it. Signed-off-by: Kevin Hao --- arch/powerpc/kernel/asm-offsets.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 98230579d99c..810f433731dc 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -213,7 +213,6 @@ int main(void) offsetof(struct tlb_core_data, esel_max)); DEFINE(TCD_ESEL_FIRST, offsetof(struct tlb_core_data, esel_first)); - DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock)); #endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PPC_STD_MMU_64 -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/slb: Use a local to avoid multiple calls to get_slb_shadow()
For no reason other than it looks ugly. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 0c7115fd314b..515730e499fe 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -62,16 +62,16 @@ static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, enum slb_index index) { + struct slb_shadow *p = get_slb_shadow(); + /* * Clear the ESID first so the entry is not valid while we are * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. */ - get_slb_shadow()->save_area[index].esid = 0; - get_slb_shadow()->save_area[index].vsid = - cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - get_slb_shadow()->save_area[index].esid = - cpu_to_be64(mk_esid_data(ea, ssize, index)); + p->save_area[index].esid = 0; + p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); + p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index)); } static inline void slb_shadow_clear(enum slb_index index) -- 2.1.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/slb: Define an enum for the bolted indexes
From: Anshuman Khandual This patch defines macros for the three bolted SLB indexes we use. Switch the functions that take the indexes as an argument to use the enum. Signed-off-by: Anshuman Khandual Signed-off-by: Michael Ellerman --- v2: Use index rather than slot as that's what the ISA docs call it. Use the enum in the function signatures. arch/powerpc/mm/slb.c | 47 ++- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 8a32a2be3c53..0c7115fd314b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -25,6 +25,11 @@ #include #include +enum slb_index { + LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ + VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000) */ + KSTACK_INDEX= 2, /* Kernel stack map */ +}; extern void slb_allocate_realmode(unsigned long ea); extern void slb_allocate_user(unsigned long ea); @@ -41,9 +46,9 @@ static void slb_allocate(unsigned long ea) (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) static inline unsigned long mk_esid_data(unsigned long ea, int ssize, -unsigned long entry) +enum slb_index index) { - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | entry; + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, @@ -55,39 +60,39 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, -unsigned long entry) +enum slb_index index) { /* * Clear the ESID first so the entry is not valid while we are * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. 
*/ - get_slb_shadow()->save_area[entry].esid = 0; - get_slb_shadow()->save_area[entry].vsid = + get_slb_shadow()->save_area[index].esid = 0; + get_slb_shadow()->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - get_slb_shadow()->save_area[entry].esid = - cpu_to_be64(mk_esid_data(ea, ssize, entry)); + get_slb_shadow()->save_area[index].esid = + cpu_to_be64(mk_esid_data(ea, ssize, index)); } -static inline void slb_shadow_clear(unsigned long entry) +static inline void slb_shadow_clear(enum slb_index index) { - get_slb_shadow()->save_area[entry].esid = 0; + get_slb_shadow()->save_area[index].esid = 0; } static inline void create_shadowed_slbe(unsigned long ea, int ssize, unsigned long flags, - unsigned long entry) + enum slb_index index) { /* * Updating the shadow buffer before writing the SLB ensures * we don't get a stale entry here if we get preempted by PHYP * between these two statements. */ - slb_shadow_update(ea, ssize, flags, entry); + slb_shadow_update(ea, ssize, flags, index); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, entry)) + "r" (mk_esid_data(ea, ssize, index)) : "memory" ); } @@ -103,16 +108,16 @@ static void __slb_flush_and_rebolt(void) lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); + ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); if ((ksp_esid_data & ~0xfffUL) <= PAGE_OFFSET) { ksp_esid_data &= ~SLB_ESID_V; ksp_vsid_data = 0; - slb_shadow_clear(2); + slb_shadow_clear(KSTACK_INDEX); } else { /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); + slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[2].vsid); + be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); } /* We need to do this 
all in asm, so we're sure we don't touch @@ -151,7 +156,7 @@ void slb_vmalloc_update(void) unsigned long vflags; vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, m
Re: [PATCH 02/20] powerpc/8xx: Map linear kernel RAM with 8M pages
On Wed, Aug 12, 2015 at 03:40:56PM +0200, Christophe Leroy wrote: > /* Insert level 1 index */ > rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, > 29 > lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)/* Get the > level 1 entry */ > + mtcrr11 Maybe mtcrf is faster? You only want one field, anyhow. > + bgt-cr7,5f /* CR7.GT = bit 29 = Large page (8M or 512K) */ You can write this as bt- 29,5f which should be easier to read. > /* Insert level 1 index */ > 3: rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, > 29 > lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)/* Get the > level 1 entry */ > + mtcrr11 > + bgt cr7,200f Same here... Probably good to comment it, too. Segher ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev