[PATCH v5 7/8] Define PERF_PMU_TXN_READ interface
Define a new PERF_PMU_TXN_READ interface to read a group of counters at once. pmu->start_txn()// Initialize before first event for each event in group pmu->read(event); // Queue each event to be read rc = pmu->commit_txn() // Read/update all queued counters Note that we use this interface with all PMUs. PMUs that implement this interface use the ->read() operation to _queue_ the counters to be read and use ->commit_txn() to actually read all the queued counters at once. PMUs that don't implement PERF_PMU_TXN_READ ignore ->start_txn() and ->commit_txn() and continue to read counters one at a time. Thanks to input from Peter Zijlstra. Signed-off-by: Sukadev Bhattiprolu --- Changelog[v4] - [Peter Zijlstra] Add lockdep_assert_held() in perf_event_read_group(). Make sure the entire transaction happens on the same CPU. --- include/linux/perf_event.h |1 + kernel/events/core.c | 24 +++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 44bf05f..da307ad 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -169,6 +169,7 @@ struct perf_event; #define PERF_EVENT_TXN 0x1 #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ +#define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags diff --git a/kernel/events/core.c b/kernel/events/core.c index e3ce047..fde2f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3189,6 +3189,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is @@ -3207,18 +3208,31 @@ static void __perf_event_read(void *info) } update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + if (event->state != 
PERF_EVENT_STATE_ACTIVE) + goto unlock; - if (!data->group) + if (!data->group) { + pmu->read(event); + data->ret = 0; goto unlock; + } + + pmu->start_txn(pmu, PERF_PMU_TXN_READ); + + pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { update_event_times(sub); - if (sub->state == PERF_EVENT_STATE_ACTIVE) + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* +* Use sibling's PMU rather than @event's since +* sibling could be on different (eg: software) PMU. +*/ sub->pmu->read(sub); + } } - data->ret = 0; + + data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 5/8] perf: Invert perf_read_group() loops
From: Peter Zijlstra In order to enable the use of perf_event_read(.group = true), we need to invert the sibling-child loop nesting of perf_read_group(). Currently we iterate the child list for each sibling, this precludes using group reads. Flip things around so we iterate each group for each child. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sukadev Bhattiprolu --- Changes to Peter's patch: - Add GFP_KERNEL to kzalloc(). - Pass in address of counter to atomic_read(). - Return event->size rather than leader->size (perf_read_group()) - Keep chkpatch happy. --- kernel/events/core.c | 85 -- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 31ec842..2221ebe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3795,50 +3795,75 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_read_group(struct perf_event *event, - u64 read_format, char __user *buf) +static void __perf_read_group_add(struct perf_event *leader, + u64 read_format, u64 *values) { - struct perf_event *leader = event->group_leader, *sub; - struct perf_event_context *ctx = leader->ctx; - int n = 0, size = 0, ret; - u64 count, enabled, running; - u64 values[5]; + struct perf_event *sub; + int n = 1; /* skip @nr */ - lockdep_assert_held(&ctx->mutex); + perf_event_read(leader, true); + + /* +* Since we co-schedule groups, {enabled,running} times of siblings +* will be identical to those of the leader, so we only publish one +* set. 
+*/ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] += leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } - count = perf_event_read_value(leader, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] += leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; + /* +* Write {count,id} tuples for every sibling. +*/ + values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - size = n * sizeof(u64); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + values[n++] += perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + } +} - if (copy_to_user(buf, values, size)) - return -EFAULT; +static int perf_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *child; + struct perf_event_context *ctx = leader->ctx; + int ret = event->read_size; + u64 *values; - ret = size; + lockdep_assert_held(&ctx->mutex); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; + values = kzalloc(event->read_size, GFP_KERNEL); + if (!values) + return -ENOMEM; - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); + values[0] = 1 + leader->nr_siblings; + + /* +* By locking the child_mutex of the leader we effectively +* lock the child list of all siblings.. XXX explain how. 
+*/ + mutex_lock(&leader->child_mutex); - size = n * sizeof(u64); + __perf_read_group_add(leader, read_format, values); + list_for_each_entry(child, &leader->child_list, child_list) + __perf_read_group_add(child, read_format, values); - if (copy_to_user(buf + ret, values, size)) { - return -EFAULT; - } + mutex_unlock(&leader->child_mutex); - ret += size; - } + if (copy_to_user(buf, values, event->read_size)) + ret = -EFAULT; + + kfree(values); return ret; } -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 2/8] perf: Split perf_event_read() and perf_event_count()
perf_event_read() does two things: - call the PMU to read/update the counter value, and - compute the total count of the event and its children Not all callers need both. perf_event_reset() for instance needs the first piece but doesn't need the second. Similarly, when we implement the ability to read a group of events using the transaction interface, we would need the two pieces done independently. Break up perf_event_read() and have it just read/update the counter and have the callers compute the total count if necessary. Signed-off-by: Sukadev Bhattiprolu --- kernel/events/core.c | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4435bf5..f9ca8cb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3212,7 +3212,7 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static u64 perf_event_read(struct perf_event *event) +static void perf_event_read(struct perf_event *event) { /* * If event is enabled and currently active on a CPU, update the @@ -3238,8 +3238,6 @@ static u64 perf_event_read(struct perf_event *event) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - - return perf_event_count(event); } /* @@ -3751,14 +3749,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) *running = 0; mutex_lock(&event->child_mutex); - total += perf_event_read(event); + + perf_event_read(event); + total += perf_event_count(event); + *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); + perf_event_read(child); + total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -3918,7 +3920,7 @@ static unsigned int 
perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - (void)perf_event_read(event); + perf_event_read(event); local64_set(&event->count, 0); perf_event_update_userpage(event); } -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 6/8] perf: Add return value for perf_event_read().
When we implement the ability to read several counters at once (using the PERF_PMU_TXN_READ transaction interface), perf_event_read() can fail when the 'group' parameter is true (eg: trying to read too many events at once). For now, have perf_event_read() return an integer. Ignore the return value when 'group' parameter is false. Signed-off-by: Sukadev Bhattiprolu --- kernel/events/core.c | 45 ++--- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2221ebe..e3ce047 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3177,6 +3177,7 @@ void perf_event_exec(void) struct perf_read_data { struct perf_event *event; bool group; + int ret; }; /* @@ -3217,6 +3218,7 @@ static void __perf_event_read(void *info) if (sub->state == PERF_EVENT_STATE_ACTIVE) sub->pmu->read(sub); } + data->ret = 0; unlock: raw_spin_unlock(&ctx->lock); @@ -3230,8 +3232,10 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static void perf_event_read(struct perf_event *event, bool group) +static int perf_event_read(struct perf_event *event, bool group) { + int ret = 0; + /* * If event is enabled and currently active on a CPU, update the * value in the event structure: @@ -3240,9 +3244,11 @@ static void perf_event_read(struct perf_event *event, bool group) struct perf_read_data data = { .event = event, .group = group, + .ret = 0, }; smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3263,6 +3269,8 @@ static void perf_event_read(struct perf_event *event, bool group) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } + + return ret; } /* @@ -3775,7 +3783,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) mutex_lock(&event->child_mutex); - perf_event_read(event, 
false); + (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + @@ -3784,7 +3792,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - perf_event_read(child, false); + (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; @@ -3795,13 +3803,16 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static void __perf_read_group_add(struct perf_event *leader, +static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event *sub; int n = 1; /* skip @nr */ + int ret; - perf_event_read(leader, true); + ret = perf_event_read(leader, true); + if (ret) + return ret; /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -3830,6 +3841,8 @@ static void __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + + return 0; } static int perf_read_group(struct perf_event *event, @@ -3837,7 +3850,7 @@ static int perf_read_group(struct perf_event *event, { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; - int ret = event->read_size; + int ret; u64 *values; lockdep_assert_held(&ctx->mutex); @@ -3854,17 +3867,27 @@ static int perf_read_group(struct perf_event *event, */ mutex_lock(&leader->child_mutex); - __perf_read_group_add(leader, read_format, values); - list_for_each_entry(child, &leader->child_list, child_list) - __perf_read_group_add(child, read_format, values); + ret = __perf_read_group_add(leader, read_format, values); + if (ret) + goto unlock; + + list_for_each_entry(child, &leader->child_list, child_list) { + ret = 
__perf_read_group_add(child, read_format, values); + if (ret) +
[PATCH v5 3/8] perf: Rename perf_event_read_{one, group}, perf_read_hw
From: "Peter Zijlstra (Intel)" In order to free up the perf_event_read_group() name: s/perf_event_read_\(one\|group\)/perf_read_\1/g s/perf_read_hw/__perf_read/g Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f9ca8cb..02095f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3675,7 +3675,7 @@ static void put_event(struct perf_event *event) * see the comment there. * * 2) there is a lock-inversion with mmap_sem through -* perf_event_read_group(), which takes faults while +* perf_read_group(), which takes faults while * holding ctx->mutex, however this is called after * the last filedesc died, so there is no possibility * to trigger the AB-BA case. @@ -3770,7 +3770,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_event_read_group(struct perf_event *event, +static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; @@ -3818,7 +3818,7 @@ static int perf_event_read_group(struct perf_event *event, return ret; } -static int perf_event_read_one(struct perf_event *event, +static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; @@ -3856,7 +3856,7 @@ static bool is_event_hup(struct perf_event *event) * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +__perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; @@ -3874,9 +3874,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, 
buf); + ret = perf_read_group(event, read_format, buf); else - ret = perf_event_read_one(event, read_format, buf); + ret = perf_read_one(event, read_format, buf); return ret; } @@ -3889,7 +3889,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ctx = perf_event_ctx_lock(event); - ret = perf_read_hw(event, buf, count); + ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/8] perf: Add a flags parameter to pmu txn interfaces
Currently, the PMU interface allows reading only one counter at a time. But some PMUs like the 24x7 counters in Power, support reading several counters at once. To leverage this functionality, extend the transaction interface to support a "transaction type". The first type, PERF_PMU_TXN_ADD, refers to the existing transactions, i.e. used to _schedule_ all the events on the PMU as a group. A second transaction type, PERF_PMU_TXN_READ, will be used in a follow-on patch, by the 24x7 counters to read several counters at once. Extend the transaction interfaces to the PMU to accept a 'txn_flags' parameter and use this parameter to ignore any transactions that are not of type PERF_PMU_TXN_ADD. Thanks to Peter Zijlstra for his input. Signed-off-by: Sukadev Bhattiprolu --- Changelog[v4] - [Peter Zijlstra] Fix a copy-paste error in power_pmu_cancel_txn(). - [Peter Zijlstra] Use __this_cpu_read() and __this_cpu_write(). Changelog[v3] - [Peter Zijlstra] Ensure the nop_txn interfaces disable/enable PMU only for TXN_ADD transactions. - [Peter Zijlstra] Cache the flags parameter in ->start_txn() and drop the flags parameter from ->commit_txn() and ->cancel_txn(). 
--- arch/powerpc/perf/core-book3s.c | 25 - arch/s390/kernel/perf_cpum_cf.c | 24 +++- arch/sparc/kernel/perf_event.c | 19 ++- arch/x86/kernel/cpu/perf_event.c | 27 +-- arch/x86/kernel/cpu/perf_event.h |1 + include/linux/perf_event.h | 14 +++--- kernel/events/core.c | 31 --- 7 files changed, 130 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index d90893b..b18efe4 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -50,6 +50,7 @@ struct cpu_hw_events { unsigned int group_flag; int n_txn_start; + int txn_flags; /* BHRB bits */ u64 bhrb_filter;/* BHRB HW branch filter */ @@ -1586,11 +1587,19 @@ static void power_pmu_stop(struct perf_event *event, int ef_flags) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. 
*/ -static void power_pmu_start_txn(struct pmu *pmu) +static void power_pmu_start_txn(struct pmu *pmu, int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; @@ -1604,6 +1613,12 @@ static void power_pmu_start_txn(struct pmu *pmu) static void power_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + int txn_flags; + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1618,10 +1633,18 @@ static int power_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw; long i, n; + int txn_flags; if (!ppmu) return -EAGAIN; + cpuhw = this_cpu_ptr(&cpu_hw_events); + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) + return 0; + n = cpuhw->n_events; if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) return -EAGAIN; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 56fdad4..a6f9e7b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -72,6 +72,7 @@ struct cpu_hw_events { atomic_tctr_set[CPUMF_CTR_SET_MAX]; u64 state, tx_state; unsigned intflags; + int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .ctr_set = { @@ -82,6 +83,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { }, .state = 0, .flags = 0, + .txn_flags = 0, }; static int get_counter_set(u64 event) @@ -572,11 +574,19 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) /* * Start group events scheduling transaction. * Set flags to perform a single test at commit time. + * + * We only support PERF_PMU_TXN_ADD transactions. 
Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void cpumf_pmu_start_
[PATCH v5 0/8] perf: Implement group-read of events using txn interface
Unlike normal hardware PMCs, the 24x7 counters in Power8 are stored in memory and accessed via a hypervisor call (HCALL). A major aspect of the HCALL is that it allows retireving _several_ counters at once (unlike regular PMCs, which are read one at a time). By reading several counters at once, we can get a more consistent snapshot of the system. This patchset extends the transaction interface to accomplish submitting several events to the PMU and have the PMU read them all at once. User is expected to submit the set of events they want to read as an "event group". In the kernel, we submit each event to the PMU using the following logic (from Peter Zijlstra). pmu->start_txn(pmu, PMU_TXN_READ); leader->read(); for_each_sibling() sibling->read(); pmu->commit_txn(); where: - the ->read()s queue events to be submitted to the hypervisor, and, - the ->commit_txn() issues the HCALL, retrieves the result and updates the event count. Architectures/PMUs that don't need/implement PMU_TXN_READ type of transactions, simply ignore the ->start_txn() and ->commit_txn() and continue to read the counters one at a time in the ->read() call. Compile/touch tested on x86. Need help testing on s390 and Sparc. Thanks to Peter Zijlstra for his input/code. Changelog[v5] - Invert the sibling-child loop nesting in perf-read-group (re-org code and drop the patch that defined perf_event_aggregate()). Changelog[v4] - Ensure all the transactions operations happen on the same CPU so PMUs can use per-CPU buffers for the transaction. - Add lockdep assert and fix a locking issue in perf_read_group(). Changelog [v3] - Simple changes/reorg of patchset to split/rename functions - [Peter Zijlstra] Save the transaction flags in ->start_txn() and drop the flags parameter from ->commit_txn() and ->cancel_txn(). - [Peter Zijlstra] The nop txn interfaces don't need to disable/enable PMU for PERF_PMU_TXN_READ transactions. 
Changelog [v2] - Use the transaction interface unconditionally to avoid special-case code. Architectures/PMUs that don't need the READ transaction types simply ignore the ->start_txn() and ->commit_txn() calls. Peter Zijlstra (2): perf: Add group reads to perf_event_read() perf: Invert perf_read_group() loops Peter Zijlstra (Intel) (1): perf: Rename perf_event_read_{one,group}, perf_read_hw Sukadev Bhattiprolu (5): perf: Add a flags parameter to pmu txn interfaces perf: Split perf_event_read() and perf_event_count() perf: Add return value for perf_event_read(). Define PERF_PMU_TXN_READ interface powerpc/perf/hv-24x7: Use PMU_TXN_READ interface arch/powerpc/perf/core-book3s.c | 25 - arch/powerpc/perf/hv-24x7.c | 166 +- arch/s390/kernel/perf_cpum_cf.c | 24 - arch/sparc/kernel/perf_event.c | 19 +++- arch/x86/kernel/cpu/perf_event.c | 27 - arch/x86/kernel/cpu/perf_event.h |1 + include/linux/perf_event.h | 15 ++- kernel/events/core.c | 210 +- 8 files changed, 429 insertions(+), 58 deletions(-) -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 11/11] cxl: Add CONFIG_CXL_EEH symbol
Once cxlflash has been merged we might drop this, but until then: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 10/11] cxl: EEH support
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 09/11] cxl: Allow the kernel to trust that an image won't change on PERST.
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 08/11] cxl: Don't remove AFUs/vPHBs in cxl_reset
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 07/11] cxl: Refactor AFU init/teardown
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:25 +1000: > +rc = cxl_map_slice_regs(afu, adapter, dev); > +if (rc) > +return rc; > > -if ((rc = cxl_map_slice_regs(afu, adapter, dev))) Like the previous patch, mixing this coding style change in with this patch makes the diff harder to follow than necessary (though not as hard as the last one). If you happen to do a v5, please put the coding style changes in a separate patch, but otherwise it looks fine: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 06/11] cxl: Refactor adaptor init/teardown
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:24 +1000: > +/* This should contain *only* operations that can safely be done in > + * both creation and recovery. > + */ > +static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev) > { > -struct cxl *adapter; > -bool free = true; > int rc; > > +adapter->dev.parent = &dev->dev; > +adapter->dev.release = cxl_release_adapter; > +pci_set_drvdata(dev, adapter); These seem a bit odd here (though perfectly harmless) - not sure these need to be done again on recovery (but maybe I'm wrong?) - seems more like something that should be done early in cxl_init_adapter? > -if ((rc = cxl_update_image_control(adapter))) > -goto err2; > +rc = cxl_update_image_control(adapter); > +if (rc) These types of coding style changes should really be in a separate patch to make it easier to see exactly how you have changed the init path in this one. I know mpe wanted these changed and after looking at the diff pretty carefully I realise that you haven't actually changed much functionally so I'll let this pass, but if you happen to do another respin please move the style changes into a separate patch. Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/eeh: Probe after unbalanced kref check
In the complete hotplug case, EEH PEs are supposed to be released and set to NULL. Normally, this is done by eeh_remove_device(), which is called from pcibios_release_device(). However, if something is holding a kref to the device, it will not be released, and the PE will remain. eeh_add_device_late() has a check for this which will explicitly destroy the PE in this case. This check in eeh_add_device_late() occurs after a call to eeh_ops->probe(). On PowerNV, probe is a pointer to pnv_eeh_probe(), which will exit without probing if there is an existing PE. This means that on PowerNV, devices with outstanding krefs will not be rediscovered by EEH correctly after a complete hotplug. This is affecting CXL (CAPI) devices in the field. Put the probe after the kref check so that the PE is destroyed and affected devices are correctly rediscovered by EEH. Fixes: d91dafc02f42 ("powerpc/eeh: Delay probing EEH device during hotplug") Cc: sta...@vger.kernel.org Cc: Gavin Shan Signed-off-by: Daniel Axtens --- arch/powerpc/kernel/eeh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index af9b597b10af..8e61d717915e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1116,9 +1116,6 @@ void eeh_add_device_late(struct pci_dev *dev) return; } - if (eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe(pdn, NULL); - /* * The EEH cache might not be removed correctly because of * unbalanced kref to the device during unplug time, which @@ -1142,6 +1139,9 @@ void eeh_add_device_late(struct pci_dev *dev) dev->dev.archdata.edev = NULL; } + if (eeh_has_flag(EEH_PROBE_MODE_DEV)) + eeh_ops->probe(pdn, NULL); + edev->pdev = dev; dev->dev.archdata.edev = edev; -- 2.1.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cxl: Plug irq_bitmap getting leaked in cxl_context
This patch plugs the leak of irq_bitmap, allocated as part of initialization of cxl_context struct; during the call to afu_allocate_irqs. The bitmap is now released during the call to afu_release_irqs. Reported-by: Matthew R. Ochs Signed-off-by: Vaibhav Jain --- drivers/misc/cxl/irq.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c index 680cd26..c8f1f9d 100644 --- a/drivers/misc/cxl/irq.c +++ b/drivers/misc/cxl/irq.c @@ -511,4 +511,8 @@ void afu_release_irqs(struct cxl_context *ctx, void *cookie) afu_irq_name_free(ctx); cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter); + + kfree(ctx->irq_bitmap); + ctx->irq_bitmap = NULL; + ctx->irq_count = 0; } -- 2.2.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] video: fbdev: fsl: Fix the sleep function for FSL DIU module
From: Jason Jin For deep sleep, the diu module will power off, when waking up from the deep sleep, the registers need to be reinitialized. Signed-off-by: Jason Jin Signed-off-by: Wang Dongsheng --- *v2* Changes: - int i -> unsigned int i. Remove: - struct mfb_info *mfbi; diff --git a/drivers/video/fbdev/fsl-diu-fb.c b/drivers/video/fbdev/fsl-diu-fb.c index 7fa2e6f..b335c1a 100644 --- a/drivers/video/fbdev/fsl-diu-fb.c +++ b/drivers/video/fbdev/fsl-diu-fb.c @@ -1628,9 +1628,16 @@ static int fsl_diu_suspend(struct platform_device *ofdev, pm_message_t state) static int fsl_diu_resume(struct platform_device *ofdev) { struct fsl_diu_data *data; + unsigned int i; data = dev_get_drvdata(&ofdev->dev); - enable_lcdc(data->fsl_diu_info); + + fsl_diu_enable_interrupts(data); + update_lcdc(data->fsl_diu_info); + for (i = 0; i < NUM_AOIS; i++) { + if (data->mfb[i].count) + fsl_diu_enable_panel(&data->fsl_diu_info[i]); + } return 0; } -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH] video/fsl: Fix the sleep function for FSL DIU module
Hi Tabi, > -Original Message- > From: Timur Tabi [mailto:ti...@tabi.org] > Sent: Tuesday, March 25, 2014 11:55 PM > To: Wang Dongsheng-B40534 > Cc: Wood Scott-B07421; Jin Zhengxiong-R64188; Li Yang-Leo-R58472; linuxppc- > d...@lists.ozlabs.org; linux-fb...@vger.kernel.org > Subject: Re: [PATCH] video/fsl: Fix the sleep function for FSL DIU module > > On 03/25/2014 02:56 AM, Dongsheng Wang wrote: > > From: Jason Jin > > > > For deep sleep, the diu module will power off, when wake up from the > > deep sleep, the registers need to be reinitialized. > > > > Signed-off-by: Jason Jin > > Signed-off-by: Wang Dongsheng > > > > diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c > > index e8758b9..7ec780c 100644 > > --- a/drivers/video/fsl-diu-fb.c > > +++ b/drivers/video/fsl-diu-fb.c > > @@ -1628,9 +1628,18 @@ static int fsl_diu_suspend(struct platform_device > *ofdev, pm_message_t state) > > static int fsl_diu_resume(struct platform_device *ofdev) > > { > > struct fsl_diu_data *data; > > + struct mfb_info *mfbi; > > You don't need this, if ... > > > + int i; > > > > data = dev_get_drvdata(&ofdev->dev); > > - enable_lcdc(data->fsl_diu_info); > > + fsl_diu_enable_interrupts(data); > > + update_lcdc(data->fsl_diu_info); > > + > > + for (i = 0; i < NUM_AOIS; i++) { > > + mfbi = &data->mfb[i]; > > + if (mfbi->count) > > ... you do this: > > if (data->mfb[i].count) > > Also, 'i' should be an 'unsigned int'. > > > + fsl_diu_enable_panel(&data->fsl_diu_info[i]); > > + } > > > > return 0; > > } > > > > Other than that, this seems okay. > Thanks, send v2 to update this patch. Regards, -Dongsheng ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 05/11] cxl: Clean up adapter MMIO unmap path.
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 04/11] cxl: Make IRQ release idempotent
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 03/11] cxl: Allocate and release the SPA with the AFU
Excerpts from Daniel Axtens's message of 2015-08-13 14:11:21 +1000: > Previously the SPA was allocated and freed upon entering and leaving > AFU-directed mode. This causes some issues for error recovery - contexts > hold a pointer inside the SPA, and they may persist after the AFU has > been detached. > > We would ideally like to allocate the SPA when the AFU is allocated, and > not release it until the AFU is released. However, we don't know how big the > SPA needs to be until we read the AFU descriptor. > > Therefore, restructure the code: > > - Allocate the SPA only once, on the first attach. > > - Release the SPA only when the entire AFU is being released (not >detached). Guard the release with a NULL check, so we don't free >if it was never allocated (e.g. dedicated mode) This is certainly an improvement, though in the long run I wonder if we should consider making the contexts increase the refcount of the AFU so that we can be sure that the AFU structure will outlive the contexts? That would be a more significant rework though, and this patch is needed either way and solves an immediate problem, so: Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 02/11] cxl: Drop commands if the PCI channel is not in normal state
Acked-by: Ian Munsie Excerpts from Daniel Axtens's message of 2015-08-13 14:11:20 +1000: > +/* Only warn if we detached while the link was OK. Only because mpe is sure to pick this up (I personally don't mind) - block comments should start with /* on a line by itself. > +/* If the adapter has gone down, we can assume that we ... > +/* We could be asked to terminate when the hw is down. That ... > +/* We could be asked to remove when the hw is down. Again, if ... > +/* If the adapter has gone away, we can't get any meaningful ... > +/* Config space IO is based on phb->cfg_addr, which is based on Ditto. Cheers, -Ian ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 01/11] cxl: Convert MMIO read/write macros to inline functions
Acked-by: Ian Munsie ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] book3s_hv_rmhandlers:Pass the correct trap argument to kvmhv_commence_exit
On Thu, May 21, 2015 at 01:57:04PM +0530, Gautham R. Shenoy wrote: > In guest_exit_cont we call kvmhv_commence_exit which expects the trap > number as the argument. However r3 doesn't contain the trap number at > this point and as a result we would be calling the function with a > spurious trap number. > > Fix this by copying r12 into r3 before calling kvmhv_commence_exit as > r12 contains the trap number > > Signed-off-by: Gautham R. Shenoy Hi Gautham, I agree with your logic: r3 is quite clearly corrupted in that path. So: Reviewed-by: Sam Bobroff Just one comment: Do you have a case of this causing some visible problem due to the corrupted trap number? (I'll test the patch if you do.) Cheers, Sam. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] powerpc: Add an inline function to update POWER8 HID0
On Wed, Aug 05, 2015 at 12:38:31PM +0530, Gautham R. Shenoy wrote: > Section 3.7 of Version 1.2 of the Power8 Processor User's Manual > prescribes that updates to HID0 be preceded by a SYNC instruction and > followed by an ISYNC instruction (Page 91). > > Create an inline function name update_power8_hid0() which follows this > recipe and invoke it from the static split core path. > > Signed-off-by: Gautham R. Shenoy Hi Gautham, I've tested this on a Power 8 machine and verified that it is able to change split modes and that when doing so the new code is used. Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] powerpc/85xx: Add binding for SCFG
From: Wang Dongsheng SCFG provides SoC specific configuration and status registers for the chip. Add this for powerpc platform. Signed-off-by: Wang Dongsheng --- *V2* - Remove scfg description in board.txt and create scfg.txt for scfg. - Change "fsl,-scfg" to "fsl,-scfg" diff --git a/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt b/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt new file mode 100644 index 000..0532c46 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/scfg.txt @@ -0,0 +1,18 @@ +Freescale Supplemental configuration unit (SCFG) + +SCFG is the supplemental configuration unit, that provides SoC specific +configuration and status registers for the chip. Such as getting PEX port +status. + +Required properties: + +- compatible: should be "fsl,<chip>-scfg" +- reg: should contain base address and length of SCFG memory-mapped +registers + +Example: + + scfg: global-utilities@fc000 { + compatible = "fsl,t1040-scfg"; + reg = <0xfc000 0x1000>; + }; -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V2] QorIQ/TMU: add thermal management support based on TMU
Hello Hongtao, On Fri, Aug 14, 2015 at 03:15:22AM +, Hongtao Jia wrote: > Hi Eduardo, > > In previous mail I asked questions about including header files in device > tree. > Don't bother, I have already figured out the solution. > > Another questions is about cpu cooling: > I found out that there is no explicit calling for registering cpu cooling > device in the of-thermal style drivers. Your understanding is correct. > > And Samsung did it in cpufreq driver: drivers/cpufreq/exynos-cpufreq.c > Yes. > Should all the of-thermal driver use the same way? of-thermal won't handle the cooling device registering. It is typically registered by the cpufreq driver. Have a look in drivers/cpufreq/cpufreq-dt.c > Or is there any recommendation for registering cpu cooling device? > (I enabled the CONFIG_CPUFREQ_DT and still got no cooling device registered) If your system supports using cpufreq-dt, then it will handle registering the cpucooling for you, if you configures the cooling dt properties in your DT files. How does your DT entry look like? BR, Eduardo > > Thanks. > > --- > Best Regards, > Hongtao > > > > -Original Message- > > From: Linuxppc-dev [mailto:linuxppc-dev- > > bounces+b38951=freescale@lists.ozlabs.org] On Behalf Of Hongtao Jia > > Sent: Friday, August 07, 2015 4:15 PM > > To: Eduardo Valentin > > Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org; linux- > > p...@vger.kernel.org > > Subject: RE: [PATCH V2] QorIQ/TMU: add thermal management support based > > on TMU > > > > Thanks for your comments. > > Please see my questions inline. > > > > Thanks. 
> > --- > > Best Regards, > > Hongtao > > > > > > > -Original Message- > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > Sent: Thursday, August 06, 2015 3:43 AM > > > To: Jia Hongtao-B38951 > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > Scott- > > > B07421 > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > based on TMU > > > > > > On Thu, Jul 30, 2015 at 08:13:09AM +, Hongtao Jia wrote: > > > > - "Any specific reason why not using OF thermal?" > > > > - No, actually. > > > > > > > > I'd like to use OF thermal after some clarification. > > > > > > > > Regarding to "cooling-maps". For some cases there should be more > > > > than one cpus as cooling device and they are independent. > > > > 1. Let's say 4. So we need to provide 4 maps like map0-map3. Right? > > > > > > That would depend on the amount of sensors you have. Do you have one > > > sensor per cpu? if the answer is yes, then you probably want to have > > > four different map entries, yes, but one on each thermal zone of each > > > cpu temperature sensor. if the answer is no, then you would need to > > > have all the maps in the same thermal zone. > > > > > > > 2. "cooling-max-level" may vary depend on switch settings or firmware. > > > Is that > > > >OK if I do not provide "cooling-min-level" and "cooling-max-level" > > > property? > > > > > > That is already achievable by using the cooling-device property of a > > > cooling map. > > > > > > Please have a look in the example section of the > > > Documentation/devicetree/bindings/thermal/thermal.txt > > > > Yes, I read this file. > > So in my understanding: > > There is no need to provide "cooling-min-level" and "cooling-max-level" > > property. > > THERMAL_NO_LIMIT value in cooling device node will indicate the driver to > > automatically parse the min and max state, right? > > > > Talking about THERMAL_NO_LIMIT, I need to #include > bindings/thermal/thermal.h> to provide the definition. 
But I got > > compiling error when build dtb file. > > I did some research and using "make t1040qds.dtb" in order to involve > > preprocessor. > > But with simply adding "#include " to > > t1040si-post.dtsi at line 35 I still got error like this: > > Error: arch/powerpc/boot/dts/fsl/t1040si-post.dtsi:35.1-9 syntax error > > FATAL ERROR: Unable to parse input tree > > > > Could you help me out here. > > Thanks. > > > > > > > > Let me know if you need further clarification. > > > > > > > > > BR, > > > > > > Eduardo Valentin > > > > > > > > > > > Thanks. > > > > -Hongtao > > > > > > > > > > > > > -Original Message- > > > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > > > Sent: Thursday, July 30, 2015 2:56 PM > > > > > To: Jia Hongtao-B38951 > > > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > > > Scott- > > > > > B07421 > > > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > > > based on TMU > > > > > > > > > > On Wed, Jul 29, 2015 at 02:19:39PM +0800, Jia Hongtao wrote: > > > > > > It supports one critical trip point and one passive trip point. > > > > > > The cpufreq is used as the cooling device to throttle CPUs when > > > > > > the passive trip is crossed. > > > > > > > > > > > > Signed-off-by: Jia Hongtao > > > > > > --- > > > > > > This patch base
Re: [PATCH 29/31] parisc: handle page-less SG entries
From: James Bottomley Date: Thu, 13 Aug 2015 20:59:20 -0700 > On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote: >> On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: >> > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: >> >> I'm assuming that anybody who wants to use the page-less >> >> scatter-gather lists always does so on memory that isn't actually >> >> virtually mapped at all, or only does so on sane architectures that >> >> are cache coherent at a physical level, but I'd like that assumption >> >> *documented* somewhere. >> > >> > It's temporarily mapped by kmap-like helpers. That code isn't in >> > this series. The most recent version of it is here: >> > >> > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 >> > >> > note that it's not doing the cache flushing it would have to do yet, but >> > it's also only enabled for x86 at the moment. >> >> For virtually tagged caches I assume we would temporarily map with >> kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements >> powerpc support. However with DAX we could end up with multiple >> virtual aliases for a page-less pfn. > > At least on some PA architectures, you have to be very careful. > Improperly managed, multiple aliases will cause the system to crash > (actually a machine check in the cache chequerboard). For the most > temperamental systems, we need the cache line flushed and the alias > mapping ejected from the TLB cache before we access the same page at an > inequivalent alias. Also, I want to mention that on sparc64 we manage the cache aliasing state in the page struct. Until a page is mapped into userspace, we just record the most recent cpu to store into that page with kernel side mappings. Once the page ends up being mapped or the cpu doing kernel side stores changes, we actually perform the cache flush. 
Generally speaking, I think that all actual physical memory the kernel operates on should have a struct page backing it. So this whole discussion of operating on physical memory in scatter lists without backing page structs feels really foreign to me. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote: > On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: > > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: > >> I'm assuming that anybody who wants to use the page-less > >> scatter-gather lists always does so on memory that isn't actually > >> virtually mapped at all, or only does so on sane architectures that > >> are cache coherent at a physical level, but I'd like that assumption > >> *documented* somewhere. > > > > It's temporarily mapped by kmap-like helpers. That code isn't in > > this series. The most recent version of it is here: > > > > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 > > > > note that it's not doing the cache flushing it would have to do yet, but > > it's also only enabled for x86 at the moment. > > For virtually tagged caches I assume we would temporarily map with > kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements > powerpc support. However with DAX we could end up with multiple > virtual aliases for a page-less pfn. At least on some PA architectures, you have to be very careful. Improperly managed, multiple aliases will cause the system to crash (actually a machine check in the cache chequerboard). For the most temperamental systems, we need the cache line flushed and the alias mapping ejected from the TLB cache before we access the same page at an inequivalent alias. James ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
On Fri, Aug 14, 2015 at 11:03:00AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:11PM +0800, Wei Yang wrote: >>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be >>sparse. >> >>This patch restructures the patch to allocate sparse PE# for VFs when M64 >>BAR is set to Single PE mode. >> >>Signed-off-by: Wei Yang >>--- >> arch/powerpc/include/asm/pci-bridge.h |2 +- >> arch/powerpc/platforms/powernv/pci-ioda.c | 59 >> +++-- >> 2 files changed, 41 insertions(+), 20 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/pci-bridge.h >>b/arch/powerpc/include/asm/pci-bridge.h >>index 9d33ada..b026ef8 100644 >>--- a/arch/powerpc/include/asm/pci-bridge.h >>+++ b/arch/powerpc/include/asm/pci-bridge.h >>@@ -214,7 +214,7 @@ struct pci_dn { >> #ifdef CONFIG_PCI_IOV >> u16 vfs_expanded; /* number of VFs IOV BAR expanded */ >> u16 num_vfs;/* number of VFs enabled*/ >>- int offset; /* PE# for the first VF PE */ >>+ int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ > >Same question as to "m64_map". pdn for non-PF doesn't need it. > The same, I prefer the dynamic version. 
>> boolm64_single_mode;/* Use M64 BAR in Single Mode */ >> #define IODA_INVALID_M64(-1) >> int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 1e6ac86..7633538 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >>u16 num_vfs) >> >> /* Map the M64 here */ >> if (pdn->m64_single_mode) { >>- pe_num = pdn->offset + j; >>+ pe_num = pdn->pe_num_map[j]; >> rc = opal_pci_map_pe_mmio_window(phb->opal_id, >> pe_num, OPAL_M64_WINDOW_TYPE, >> pdn->m64_map[i][j], 0); >>@@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) >> struct pnv_phb*phb; >> struct pci_dn *pdn; >> struct pci_sriov *iov; >>- u16 num_vfs; >>+ u16 num_vfs, i; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) >> >> if (phb->type == PNV_PHB_IODA2) { >> if (!pdn->m64_single_mode) >>- pnv_pci_vf_resource_shift(pdev, -pdn->offset); >>+ pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); >> >> /* Release M64 windows */ >> pnv_pci_vf_release_m64(pdev); >> >> /* Release PE numbers */ >>- bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >>- pdn->offset = 0; >>+ if (pdn->m64_single_mode) { >>+ for (i = 0; i < num_vfs; i++) >>+ pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); >>+ } else >>+ bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], >>num_vfs); >> } >> } >> >>@@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, >>u16 num_vfs) >> >> /* Reserve PE for each VF */ >> for (vf_index = 0; vf_index < num_vfs; vf_index++) { >>- pe_num = pdn->offset + vf_index; >>+ if (pdn->m64_single_mode) >>+ pe_num = pdn->pe_num_map[vf_index]; >>+ else >>+ pe_num = pdn->pe_num_map[0] + vf_index; >> >> pe = &phb->ioda.pe_array[pe_num]; >> pe->pe_number = pe_num; >>@@ -1425,6 
+1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >>num_vfs) >> struct pnv_phb*phb; >> struct pci_dn *pdn; >> intret; >>+ u16i; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >>num_vfs) >> } >> >> /* Calculate available PE for required VFs */ >>- mutex_lock(&phb->ioda.pe_alloc_mutex); >>- pdn->offset = bitmap_find_next_zero_area( >>- phb->ioda.pe_alloc, phb->ioda.total_pe, >>- 0, num_vfs, 0); >>- if (pdn->offset >= phb->ioda.total_pe) { >>+ if (pdn->m64_single_mode) { >>+ for (i = 0; i < num_vfs; i++) >>+ pdn->pe_num_map[i] = IODA_INVALID_PE; >>+ for (i = 0; i < num_vfs; i++) { >>+ pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); >>+ if (pdn->pe_num_map[i] == IODA_INVALID_PE) { >>+
Re: [PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
On Fri, Aug 14, 2015 at 10:52:21AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:08PM +0800, Wei Yang wrote: >>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >>BARs in Single PE mode to cover the number of VFs required to be enabled. >>By doing so, several VFs would be in one VF Group and leads to interference >>between VFs in the same group. >> >>This patch changes the design by using one M64 BAR in Single PE mode for >>one VF BAR. This gives absolute isolation for VFs. >> >>Signed-off-by: Wei Yang >>--- >> arch/powerpc/include/asm/pci-bridge.h |6 +- >> arch/powerpc/platforms/powernv/pci-ioda.c | 163 >> +++-- >> 2 files changed, 62 insertions(+), 107 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/pci-bridge.h >>b/arch/powerpc/include/asm/pci-bridge.h >>index 712add5..9d33ada 100644 >>--- a/arch/powerpc/include/asm/pci-bridge.h >>+++ b/arch/powerpc/include/asm/pci-bridge.h >>@@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem >>*address) >> */ >> struct iommu_table; >> >>+#define MAX_M64_BAR 16 > >struct pnv_phb::m64_bar_idx is initialized to 15. Another macro is defined here >as 16. Both of them can be used as maximal M64 BAR number. Obviously, they're >duplicated. On the other hand, I don't think it's a good idea to have the >static >"m64_map" because @pdn is created for every PCI devices, including VFs. non-PF >don't "m64_map", together other fields like "m64_per_iov" at all. It's >obviously >wasting memory. So it would be allocated dynamically when the PF's pdn is >created >or in pnv_pci_ioda_fixup_iov_resources(). > I prefer the dynamic one. Alexey, I changed to static defined based on your comments. So do you have some concern on the dynamic version? >In long run, it might be reasonable to move all SRIOV related fields in pci_dn >to another data struct (struct pci_iov_dn?) and allocate that dynamically. 
> >> int flags; >> #define PCI_DN_FLAG_IOV_VF 0x01 >>@@ -214,10 +215,9 @@ struct pci_dn { >> u16 vfs_expanded; /* number of VFs IOV BAR expanded */ >> u16 num_vfs;/* number of VFs enabled*/ >> int offset; /* PE# for the first VF PE */ >>-#define M64_PER_IOV 4 >>- int m64_per_iov; >>+ boolm64_single_mode;/* Use M64 BAR in Single Mode */ >> #define IODA_INVALID_M64(-1) >>- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >>+ int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >> #endif /* CONFIG_PCI_IOV */ >> #endif >> struct list_head child_list; >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 67b8f72..4da0f50 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev >>*pdev) >> pdn = pci_get_pdn(pdev); >> >> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) { >>- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >>+ for (j = 0; j < MAX_M64_BAR; j++) { >>+ if (pdn->m64_map[i][j] == IODA_INVALID_M64) >> continue; >> opal_pci_phb_mmio_enable(phb->opal_id, >>- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >>- clear_bit(pdn->m64_wins[i][j], >>&phb->ioda.m64_bar_alloc); >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); >>+ clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); >>+ pdn->m64_map[i][j] = IODA_INVALID_M64; >> } >>- >> return 0; >> } >> >>@@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >>u16 num_vfs) >> inttotal_vfs; >> resource_size_tsize, start; >> intpe_num; >>- intvf_groups; >>- intvf_per_group; >>+ intm64_bars; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev >>*pdev, u16 num_vfs) >> pdn = pci_get_pdn(pdev); >> total_vfs = pci_sriov_get_totalvfs(pdev); >> >>- /* Initialize the m64_wins to 
IODA_INVALID_M64 */ >>- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ if (pdn->m64_single_mode) >>+ m64_bars = num_vfs; >>+ else >>+ m64_bars = 1; >>+ >>+ /* Initialize the m64_map to IODA_INVALID_M64 */ >>+ for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) >>+ for (j = 0; j < MAX_M64_BAR; j++) >>+ pdn-
[PATCH v2 2/2] powerpc/mpc85xx:Add SCFG device tree support of T104x
From: Wang Dongsheng Signed-off-by: Wang Dongsheng --- *V2* No changes. diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index 9e9f7e2..9770d02 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -484,6 +484,11 @@ reg= <0xea000 0x4000>; }; + scfg: global-utilities@fc000 { + compatible = "fsl,t1040-scfg"; + reg = <0xfc000 0x1000>; + }; + /include/ "elo3-dma-0.dtsi" /include/ "elo3-dma-1.dtsi" /include/ "qoriq-espi-0.dtsi" -- 2.1.0.27.g96db324 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
On Fri, Aug 14, 2015 at 11:04:58AM +1000, Gavin Shan wrote: >On Thu, Aug 13, 2015 at 10:11:07PM +0800, Wei Yang wrote: >>The alignment of IOV BAR on PowerNV platform is the total size of the IOV >>BAR. No matter whether the IOV BAR is extended with number of >>roundup_pow_of_two(total_vfs) or number of max PE number (256), the total >>size could be calculated by (vfs_expanded * VF_BAR_size). >> >>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the >>first case. >> >>Signed-off-by: Wei Yang >>Reviewed-by: Gavin Shan >>--- >> arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- >> 1 file changed, 9 insertions(+), 5 deletions(-) >> >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >>b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 9ac324e..67b8f72 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -2987,12 +2987,16 @@ static resource_size_t >>pnv_pci_iov_resource_alignment(struct pci_dev *pdev, >>int resno) >> { >> struct pci_dn *pdn = pci_get_pdn(pdev); >>- resource_size_t align, iov_align; >>- >>- iov_align = resource_size(&pdev->resource[resno]); >>- if (iov_align) >>- return iov_align; >>+ resource_size_t align; >> >>+ /* >>+ * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the >>+ * SR-IOV. While from hardware perspective, the range mapped by M64 >>+ * BAR should be size aligned. >>+ * >>+ * This function return the total IOV BAR size if expanded or just the >>+ * individual size if not. >>+ */ > >s/return/returns > Thanks :-) >> align = pci_iov_resource_size(pdev, resno); >> if (pdn->vfs_expanded) >> return pdn->vfs_expanded * align; >>-- >>1.7.9.5 >> -- Richard Yang Help you, Help me ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] powerpc/e6500: hw tablewalk: order the memory access when acquire/release tcd lock
On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote: > I didn't find anything unusual. But I think we do need to order the > load/store of esel_next when acquire/release tcd lock. For acquire, > add a data dependency to order the loads of lock and esel_next. > For release, even there already have a "isync" here, but it doesn't > guarantee any memory access order. So we still need "lwsync" for > the two stores for lock and esel_next. I was going to say that esel_next is just a hint and it doesn't really matter if we occasionally get the wrong value, unless it happens often enough to cause more performance degradation than the lwsync causes. However, with the A-008139 workaround we do need to read the same value from esel_next both times. It might be less costly to save/restore an additional register instead of lwsync, though. -Scott ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig wrote: > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: >> I'm assuming that anybody who wants to use the page-less >> scatter-gather lists always does so on memory that isn't actually >> virtually mapped at all, or only does so on sane architectures that >> are cache coherent at a physical level, but I'd like that assumption >> *documented* somewhere. > > It's temporarily mapped by kmap-like helpers. That code isn't in > this series. The most recent version of it is here: > > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 > > note that it's not doing the cache flushing it would have to do yet, but > it's also only enabled for x86 at the moment. For virtually tagged caches I assume we would temporarily map with kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements powerpc support. However with DAX we could end up with multiple virtual aliases for a page-less pfn. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH V2] QorIQ/TMU: add thermal management support based on TMU
Hi Eduardo, In previous mail I asked questions about including header files in device tree. Don't bother, I have already figured out the solution. Another questions is about cpu cooling: I found out that there is no explicit calling for registering cpu cooling device in the of-thermal style drivers. And Samsung did it in cpufreq driver: drivers/cpufreq/exynos-cpufreq.c Should all the of-thermal driver use the same way? Or is there any recommendation for registering cpu cooling device? (I enabled the CONFIG_CPUFREQ_DT and still got no cooling device registered) Thanks. --- Best Regards, Hongtao > -Original Message- > From: Linuxppc-dev [mailto:linuxppc-dev- > bounces+b38951=freescale@lists.ozlabs.org] On Behalf Of Hongtao Jia > Sent: Friday, August 07, 2015 4:15 PM > To: Eduardo Valentin > Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org; linux- > p...@vger.kernel.org > Subject: RE: [PATCH V2] QorIQ/TMU: add thermal management support based > on TMU > > Thanks for your comments. > Please see my questions inline. > > Thanks. > --- > Best Regards, > Hongtao > > > > -Original Message- > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > Sent: Thursday, August 06, 2015 3:43 AM > > To: Jia Hongtao-B38951 > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > Scott- > > B07421 > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > based on TMU > > > > On Thu, Jul 30, 2015 at 08:13:09AM +, Hongtao Jia wrote: > > > - "Any specific reason why not using OF thermal?" > > > - No, actually. > > > > > > I'd like to use OF thermal after some clarification. > > > > > > Regarding to "cooling-maps". For some cases there should be more > > > than one cpus as cooling device and they are independent. > > > 1. Let's say 4. So we need to provide 4 maps like map0-map3. Right? > > > > That would depend on the amount of sensors you have. Do you have one > > sensor per cpu? 
if the answer is yes, then you probably want to have > > four different map entries, yes, but one on each thermal zone of each > > cpu temperature sensor. if the answer is no, then you would need to > > have all the maps in the same thermal zone. > > > > > 2. "cooling-max-level" may vary depend on switch settings or firmware. > > Is that > > >OK if I do not provide "cooling-min-level" and "cooling-max-level" > > property? > > > > That is already achievable by using the cooling-device property of a > > cooling map. > > > > Please have a look in the example section of the > > Documentation/devicetree/bindings/thermal/thermal.txt > > Yes, I read this file. > So in my understanding: > There is no need to provide "cooling-min-level" and "cooling-max-level" > property. > THERMAL_NO_LIMIT value in cooling device node will indicate the driver to > automatically parse the min and max state, right? > > Talking about THERMAL_NO_LIMIT, I need to #include bindings/thermal/thermal.h> to provide the definition. But I got > compiling error when build dtb file. > I did some research and using "make t1040qds.dtb" in order to involve > preprocessor. > But with simply adding "#include " to > t1040si-post.dtsi at line 35 I still got error like this: > Error: arch/powerpc/boot/dts/fsl/t1040si-post.dtsi:35.1-9 syntax error > FATAL ERROR: Unable to parse input tree > > Could you help me out here. > Thanks. > > > > > Let me know if you need further clarification. > > > > > > BR, > > > > Eduardo Valentin > > > > > > > > Thanks. 
> > > -Hongtao > > > > > > > > > > -Original Message- > > > > From: Eduardo Valentin [mailto:edubez...@gmail.com] > > > > Sent: Thursday, July 30, 2015 2:56 PM > > > > To: Jia Hongtao-B38951 > > > > Cc: linux...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood > > > > Scott- > > > > B07421 > > > > Subject: Re: [PATCH V2] QorIQ/TMU: add thermal management support > > > > based on TMU > > > > > > > > On Wed, Jul 29, 2015 at 02:19:39PM +0800, Jia Hongtao wrote: > > > > > It supports one critical trip point and one passive trip point. > > > > > The cpufreq is used as the cooling device to throttle CPUs when > > > > > the passive trip is crossed. > > > > > > > > > > Signed-off-by: Jia Hongtao > > > > > --- > > > > > This patch based on: > > > > > http://patchwork.ozlabs.org/patch/482987/ > > > > > > > > > > Changes for V2: > > > > > * Add tmu-range parse. > > > > > * Use default trend hook. > > > > > * Using latest thermal_zone_bind_cooling_device API. > > > > > * Add calibration check during initialization. > > > > > * Disable/enalbe device when suspend/resume. > > > > > > > > > > drivers/thermal/Kconfig | 11 ++ > > > > > drivers/thermal/Makefile| 1 + > > > > > drivers/thermal/qoriq_thermal.c | 406 > > > > > > > > > > 3 files changed, 418 insertions(+) create mode 100644 > > > > > drivers/thermal/qoriq_thermal.c > > > > > > > > > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > > > >
[PATCH 1/1] powerpc/xmon: Paged output for paca display
The paca display is already more than 24 lines, which can be problematic if you have an old school 80x24 terminal, or more likely you are on a virtual terminal which does not scroll for whatever reason. This adds an optional letter to the "dp" and "dpa" xmon commands ("dpp" and "dppa"), which will enable a "per-page" display (with 16 line pages): the first page will be displayed and if there was data that didn't fit, it will display a message indicating that the user can use enter to display the next page. The intent is that this feels similar to the way the memory display functions work. This is implemented by running over the entire output both for the initial command and for each subsequent page: the visible part is clipped out by checking line numbers. Handling the empty command as "more" is done by writing a special command into a static buffer that indicates where to move the sliding visibility window. This is similar to the approach used for the memory dump commands except that the state data is encoded into the "last_cmd" string, rather than a set of static variables. The memory dump commands could probably be rewritten to make use of the same buffer and remove their other static variables. 
Sample output: 0:mon> dpp1 paca for cpu 0x1 @ cfdc0480: possible = yes present = yes online = yes lock_token = 0x8000 (0x8) paca_index = 0x1 (0xa) kernel_toc = 0xc0eb2400 (0x10) kernelbase = 0xc000 (0x18) kernel_msr = 0xb0001032 (0x20) emergency_sp = 0xc0003ffe8000 (0x28) mc_emergency_sp = 0xc0003ffe4000 (0x2e0) in_mce = 0x0 (0x2e8) data_offset = 0x7f17 (0x30) hw_cpu_id= 0x8 (0x38) cpu_start= 0x1 (0x3a) kexec_state = 0x0 (0x3b) [Enter for next page] 0:mon> __current= 0xc0007e696620 (0x290) kstack = 0xc0007e6ebe30 (0x298) stab_rr = 0xb (0x2a0) saved_r1 = 0xc0007ef37860 (0x2a8) trap_save= 0x0 (0x2b8) soft_enabled = 0x0 (0x2ba) irq_happened = 0x1 (0x2bb) io_sync = 0x0 (0x2bc) irq_work_pending = 0x0 (0x2bd) nap_state_lost = 0x0 (0x2be) 0:mon> (Based on a similar patch by Michael Ellerman "[v2] powerpc/xmon: Allow limiting the size of the paca display". This patch is an alternative and cannot coexist with the original.) Signed-off-by: Sam Bobroff --- arch/powerpc/xmon/xmon.c | 82 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e599259..9157286 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -72,6 +72,7 @@ static int xmon_gate; static unsigned long in_xmon __read_mostly = 0; +static char last_cmd_buf[128]; static unsigned long adrs; static int size = 1; #define MAX_DUMP (128 * 1024) @@ -204,8 +205,8 @@ Commands:\n\ dldump the kernel log buffer\n" #ifdef CONFIG_PPC64 "\ - dp[#]dump paca for current cpu, or cpu #\n\ - dpa dump paca for all possible cpus\n" + dp[p][#] dump paca for current cpu, or cpu # (p = paged)\n\ + dp[p]a dump paca for all possible cpus (p = paged)\n" #endif "\ dr dump stream of raw bytes\n\ @@ -2070,7 +2071,17 @@ static void xmon_rawdump (unsigned long adrs, long ndump) } #ifdef CONFIG_PPC64 -static void dump_one_paca(int cpu) +static bool line_visible(unsigned long start, unsigned long count, +unsigned long *line) { + bool rv = (!count + || ((*line >= 
start) && (*line < (start + count; + + (*line)++; + return rv; +} + +static void dump_one_paca(int cpu, unsigned long start, + unsigned long count, unsigned long *line) { struct paca_struct *p; @@ -2084,15 +2095,22 @@ static void dump_one_paca(int cpu) p = &paca[cpu]; - printf("paca for cpu 0x%x @ %p:\n", cpu, p); +#define VPRINT(...) do { \ + if (line_visible(start, count, line)) \ + printf(__VA_ARGS__); \ +} while (0) + VPRINT("paca for cpu 0x%x @ %p:\n", cpu, p); - printf(" %-*s = %s\n", 16, "possible", cpu_possible(cpu) ? "yes" : "no"); - printf(" %-*s = %s\n", 16, "present", cpu_present(cpu) ? "yes" : "no"); - printf(" %-*s = %s\n", 16, "online", cpu_online(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "possible", cpu_possible(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "present", cpu_present(cpu) ? "yes" : "no"); + VPRINT(" %-*s = %s\n", 16, "online", cpu_online(cpu) ? "yes" : "no"); +#undef VPRINT -#define DUMP(paca, name, format) \ - printf(" %-*s = %#-*"format"\t(0x%lx)\n", 16, #name, 1
Re: [V3] powerpc/irq: Enable some more exceptions in /proc/interrupts interface
On Thu, 2015-08-06 at 18:54 +0530, Anshuman Khandual wrote: > On 08/04/2015 03:27 PM, Michael Ellerman wrote: > > On Mon, 2015-13-07 at 08:16:06 UTC, Anshuman Khandual wrote: > >> This patch enables facility unavailable exceptions for generic facility, > >> FPU, ALTIVEC and VSX in /proc/interrupts listing by incrementing their > >> newly added IRQ statistical counters as and when these exceptions happen. > >> This also adds couple of helper functions which will be called from within > >> the interrupt handler context to update their statistics. Similarly this > >> patch also enables alignment and program check exceptions as well. > > > > ... > > > >> diff --git a/arch/powerpc/kernel/exceptions-64s.S > >> b/arch/powerpc/kernel/exceptions-64s.S > >> index 0a0399c2..a86180c 100644 > >> --- a/arch/powerpc/kernel/exceptions-64s.S > >> +++ b/arch/powerpc/kernel/exceptions-64s.S > >> @@ -1158,6 +1158,7 @@ BEGIN_FTR_SECTION > >> END_FTR_SECTION_IFSET(CPU_FTR_TM) > >> #endif > >>bl load_up_fpu > >> + bl fpu_unav_exceptions_count > > > > Is it safe to call C code here? > > Hmm, is it not ? I had that question but was not really sure. Dont > understand the difference between 'fast_exception_return' and > 'ret_from_except' completely. If you're "not really sure" it's correct, please say so in the change log! I'd rather you didn't send me patches with possibly subtle bugs in core code. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] kvm:powerpc:Fix return statements for wrapper functions in the file book3s_64_mmu_hv.c
On Wed, 2015-08-12 at 21:06 +0200, Alexander Graf wrote: > > On 10.08.15 17:27, Nicholas Krause wrote: > > This fixes the wrapper functions kvm_umap_hva_hv and the function > > kvm_unmap_hav_range_hv to return the return value of the function > > kvm_handle_hva or kvm_handle_hva_range that they are wrapped to > > call internally rather then always making the caller of these > > wrapper functions think they always run successfully by returning > > the value of zero directly. > > > > Signed-off-by: Nicholas Krause > > Paul, could you please take on this one? Paul's away for a while can you take it directly? cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel
On Wed, 2015-08-05 at 14:03 +1000, Anton Blanchard wrote: > Hi, > > While looking at traces of kernel workloads, I noticed places where gcc > used a large number of non volatiles. Some of these functions > did very little work, and we spent most of our time saving the > non volatiles to the stack and reading them back. > > It made me wonder if we have the right ratio of volatile to non > volatile GPRs. Since the kernel is completely self contained, we could > potentially change that ratio. > > Attached is a quick hack to gcc and the kernel to decrease the number > of non volatile GPRs to 8. I'm not sure if this is a good idea (and if > the volatile to non volatile ratio is right), but this gives us > something to play with. OK, interesting idea. Can't say I'd ever though of that. I'm thinking we'd want some pretty solid analysis of the resulting code-gen and real world perf before we made a switch like that. Presumably it's going to hurt our null syscall, due to the added save/restores, but hopefully help with paths that do actual work. If the caller is actually using the non-volatiles then presumably it will be a wash, because the caller will have to do the save anyway. Though maybe it would still be a win because the caller can do the saves & restores when it needs to rather than all in a block. I'm also not clear on how it would affect folks who build modules separate from the kernel. We'd have to make sure they had the right GCC, or things would go badly wrong, unless it can be done with command line flags? I don't know how much we care about that but distros presumably do. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2] powerpc/xmon: Allow limiting the size of the paca display
On Wed, Aug 12, 2015 at 09:55:25PM +1000, Michael Ellerman wrote: > The paca display is already more than 24 lines, which can be problematic > if you have an old school 80x24 terminal, or more likely you are on a > virtual terminal which does not scroll for whatever reason. > > We'd like to expand the paca display even more, so add a way to limit > the number of lines that are displayed. > > This adds a third form of 'dp' which is 'dp # #', where the first number > is the cpu, and the second is the number of lines to display. > > Example output: > > 5:mon> dp 3 6 > paca for cpu 0x3 @ cfe00c00: >possible = yes >present = yes >online = yes >lock_token = 0x8000 (0xa) >paca_index = 0x3 (0x8) Michael, This patch inspired me to do the additional work to make the output paged, more like the memory dump commands. I'll post it shortly as "powerpc/xmon: Paged output for paca display". Cheers, Sam. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
On Thu, Aug 13, 2015 at 10:11:07PM +0800, Wei Yang wrote: >The alignment of IOV BAR on PowerNV platform is the total size of the IOV >BAR. No matter whether the IOV BAR is extended with number of >roundup_pow_of_two(total_vfs) or number of max PE number (256), the total >size could be calculated by (vfs_expanded * VF_BAR_size). > >This patch simplifies the pnv_pci_iov_resource_alignment() by removing the >first case. > >Signed-off-by: Wei Yang >Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- > 1 file changed, 9 insertions(+), 5 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 9ac324e..67b8f72 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2987,12 +2987,16 @@ static resource_size_t >pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > int resno) > { > struct pci_dn *pdn = pci_get_pdn(pdev); >- resource_size_t align, iov_align; >- >- iov_align = resource_size(&pdev->resource[resno]); >- if (iov_align) >- return iov_align; >+ resource_size_t align; > >+ /* >+ * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the >+ * SR-IOV. While from hardware perspective, the range mapped by M64 >+ * BAR should be size aligned. >+ * >+ * This function return the total IOV BAR size if expanded or just the >+ * individual size if not. >+ */ s/return/returns > align = pci_iov_resource_size(pdev, resno); > if (pdn->vfs_expanded) > return pdn->vfs_expanded * align; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
On Thu, Aug 13, 2015 at 10:11:11PM +0800, Wei Yang wrote: >When M64 BAR is set to Single PE mode, the PE# assigned to VF could be >sparse. > >This patch restructures the patch to allocate sparse PE# for VFs when M64 >BAR is set to Single PE mode. > >Signed-off-by: Wei Yang >--- > arch/powerpc/include/asm/pci-bridge.h |2 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 59 +++-- > 2 files changed, 41 insertions(+), 20 deletions(-) > >diff --git a/arch/powerpc/include/asm/pci-bridge.h >b/arch/powerpc/include/asm/pci-bridge.h >index 9d33ada..b026ef8 100644 >--- a/arch/powerpc/include/asm/pci-bridge.h >+++ b/arch/powerpc/include/asm/pci-bridge.h >@@ -214,7 +214,7 @@ struct pci_dn { > #ifdef CONFIG_PCI_IOV > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs;/* number of VFs enabled*/ >- int offset; /* PE# for the first VF PE */ >+ int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ Same question as to "m64_map". pdn for non-PF doesn't need it. > boolm64_single_mode;/* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64(-1) > int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 1e6ac86..7633538 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > > /* Map the M64 here */ > if (pdn->m64_single_mode) { >- pe_num = pdn->offset + j; >+ pe_num = pdn->pe_num_map[j]; > rc = opal_pci_map_pe_mmio_window(phb->opal_id, > pe_num, OPAL_M64_WINDOW_TYPE, > pdn->m64_map[i][j], 0); >@@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > struct pnv_phb*phb; > struct pci_dn *pdn; > struct pci_sriov *iov; >- u16 num_vfs; >+ u16 num_vfs, i; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > > if (phb->type == 
PNV_PHB_IODA2) { > if (!pdn->m64_single_mode) >- pnv_pci_vf_resource_shift(pdev, -pdn->offset); >+ pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); > > /* Release M64 windows */ > pnv_pci_vf_release_m64(pdev); > > /* Release PE numbers */ >- bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >- pdn->offset = 0; >+ if (pdn->m64_single_mode) { >+ for (i = 0; i < num_vfs; i++) >+ pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); >+ } else >+ bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], >num_vfs); > } > } > >@@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, >u16 num_vfs) > > /* Reserve PE for each VF */ > for (vf_index = 0; vf_index < num_vfs; vf_index++) { >- pe_num = pdn->offset + vf_index; >+ if (pdn->m64_single_mode) >+ pe_num = pdn->pe_num_map[vf_index]; >+ else >+ pe_num = pdn->pe_num_map[0] + vf_index; > > pe = &phb->ioda.pe_array[pe_num]; > pe->pe_number = pe_num; >@@ -1425,6 +1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > struct pnv_phb*phb; > struct pci_dn *pdn; > intret; >+ u16i; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > } > > /* Calculate available PE for required VFs */ >- mutex_lock(&phb->ioda.pe_alloc_mutex); >- pdn->offset = bitmap_find_next_zero_area( >- phb->ioda.pe_alloc, phb->ioda.total_pe, >- 0, num_vfs, 0); >- if (pdn->offset >= phb->ioda.total_pe) { >+ if (pdn->m64_single_mode) { >+ for (i = 0; i < num_vfs; i++) >+ pdn->pe_num_map[i] = IODA_INVALID_PE; >+ for (i = 0; i < num_vfs; i++) { >+ pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); >+ if (pdn->pe_num_map[i] == IODA_INVALID_PE) { >+ ret = -EBUSY; >+ goto m64_failed; >+ } >+ } >+
Re: [PATCH v3 5/6] powerpc/powernv: boundary the total VF BAR size instead of the individual one
On Thu, Aug 13, 2015 at 10:11:10PM +0800, Wei Yang wrote: >Each VF could have 6 BARs at most. When the total BAR size exceeds the >gate, after expanding it will also exhaust the M64 Window. > >This patch limits the boundary by checking the total VF BAR size instead of >the individual BAR. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 13 +++-- > 1 file changed, 7 insertions(+), 6 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 3e8c0b4..1e6ac86 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > struct pnv_phb *phb; > struct resource *res; > int i; >- resource_size_t size, gate; >+ resource_size_t size, gate, total_vf_bar_sz; > struct pci_dn *pdn; > int mul, total_vfs; > >@@ -2715,6 +2715,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) >* which will exhaust the M64 Space and limit the system flexibility. 
>*/ > gate = phb->ioda.m64_segsize >> 1; >+ total_vf_bar_sz = 0; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; >@@ -2727,13 +2728,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > return; > } > >- size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >+ total_vf_bar_sz += pci_iov_resource_size(pdev, >+ i + PCI_IOV_RESOURCES); > > /* bigger than or equal to gate */ >- if (size >= gate) { >- dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " >- "is bigger than %lld, roundup power2\n", >- i, res, gate); >+ if (total_vf_bar_sz >= gate) { >+ dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size " >+ "is bigger than %lld, roundup power2\n", gate); dev_info(&pdev->dev, "PowerNV: Total VF BAR size %lld " "is bigger than %lld, roundup power2\n", total_vf_bar_sz, gate); > mul = roundup_pow_of_two(total_vfs); > pdn->m64_single_mode = true; > break; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 4/6] powerpc/powernv: replace the hard coded boundary with gate
On Thu, Aug 13, 2015 at 10:11:09PM +0800, Wei Yang wrote: >At the moment 64bit-prefetchable window can be maximum 64GB, which is >currently got from device tree. This means that in shared mode the maximum >supported VF BAR size is 64GB/256=256MB. While this size could exhaust the >whole 64bit-prefetchable window. This is a design decision to set a >boundary to 64MB of the VF BAR size. Since VF BAR size with 64MB would >occupy a quarter of the 64bit-prefetchable window, this is affordable. > >This patch replaces magic limit of 64MB with (m64_segsize >> 1) and adds >comment to explain the reason for it. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 22 +- > 1 file changed, 17 insertions(+), 5 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 4da0f50..3e8c0b4 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > struct pnv_phb *phb; > struct resource *res; > int i; >- resource_size_t size; >+ resource_size_t size, gate; > struct pci_dn *pdn; > int mul, total_vfs; > >@@ -2704,6 +2704,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > > total_vfs = pci_sriov_get_totalvfs(pdev); > mul = phb->ioda.total_pe; >+ /* >+ * If bigger than or equal to half of M64 segment size, just round up >+ * power of two. >+ * >+ * Generally, one M64 BAR maps one IOV BAR. To avoid conflict with >+ * other devices, IOV BAR size is expanded to be (total_pe * >+ * VF_BAR_size). When VF_BAR_size is half of M64 segment size , the >+ * expanded size would equal to half of the whole M64 Space size, >+ * which will exhaust the M64 Space and limit the system flexibility. 
>+ */ s/M64 Space/M64 space >+ gate = phb->ioda.m64_segsize >> 1; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; >@@ -2718,10 +2729,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); > >- /* bigger than 64M */ >- if (size > (1 << 26)) { >- dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size >is bigger than 64M, roundup power2\n", >- i, res); >+ /* bigger than or equal to gate */ >+ if (size >= gate) { >+ dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " >+ "is bigger than %lld, roundup power2\n", >+ i, res, gate); > mul = roundup_pow_of_two(total_vfs); > pdn->m64_single_mode = true; > break; >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
On Thu, Aug 13, 2015 at 10:11:08PM +0800, Wei Yang wrote: >In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >BARs in Single PE mode to cover the number of VFs required to be enabled. >By doing so, several VFs would be in one VF Group and leads to interference >between VFs in the same group. > >This patch changes the design by using one M64 BAR in Single PE mode for >one VF BAR. This gives absolute isolation for VFs. > >Signed-off-by: Wei Yang >--- > arch/powerpc/include/asm/pci-bridge.h |6 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 163 +++-- > 2 files changed, 62 insertions(+), 107 deletions(-) > >diff --git a/arch/powerpc/include/asm/pci-bridge.h >b/arch/powerpc/include/asm/pci-bridge.h >index 712add5..9d33ada 100644 >--- a/arch/powerpc/include/asm/pci-bridge.h >+++ b/arch/powerpc/include/asm/pci-bridge.h >@@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem >*address) > */ > struct iommu_table; > >+#define MAX_M64_BAR 16 struct pnv_phb::m64_bar_idx is initialized to 15. Another macro is defined here as 16. Both of them can be used as maximal M64 BAR number. Obviously, they're duplicated. On the other hand, I don't think it's a good idea to have the static "m64_map" because @pdn is created for every PCI devices, including VFs. non-PF don't "m64_map", together other fields like "m64_per_iov" at all. It's obviously wasting memory. So it would be allocated dynamically when the PF's pdn is created or in pnv_pci_ioda_fixup_iov_resources(). In long run, it might be reasonable to move all SRIOV related fields in pci_dn to another data struct (struct pci_iov_dn?) and allocate that dynamically. 
> int flags; > #define PCI_DN_FLAG_IOV_VF0x01 >@@ -214,10 +215,9 @@ struct pci_dn { > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs;/* number of VFs enabled*/ > int offset; /* PE# for the first VF PE */ >-#define M64_PER_IOV 4 >- int m64_per_iov; >+ boolm64_single_mode;/* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64(-1) >- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >+ int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; > #endif /* CONFIG_PCI_IOV */ > #endif > struct list_head child_list; >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 67b8f72..4da0f50 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev) > pdn = pci_get_pdn(pdev); > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 0; j < M64_PER_IOV; j++) { >- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >+ for (j = 0; j < MAX_M64_BAR; j++) { >+ if (pdn->m64_map[i][j] == IODA_INVALID_M64) > continue; > opal_pci_phb_mmio_enable(phb->opal_id, >- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >- clear_bit(pdn->m64_wins[i][j], >&phb->ioda.m64_bar_alloc); >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); >+ clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); >+ pdn->m64_map[i][j] = IODA_INVALID_M64; > } >- > return 0; > } > >@@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > inttotal_vfs; > resource_size_tsize, start; > intpe_num; >- intvf_groups; >- intvf_per_group; >+ intm64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > pdn = pci_get_pdn(pdev); > total_vfs = pci_sriov_get_totalvfs(pdev); > >- /* Initialize the m64_wins to IODA_INVALID_M64 */ >- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 
0; j < M64_PER_IOV; j++) >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ if (pdn->m64_single_mode) >+ m64_bars = num_vfs; >+ else >+ m64_bars = 1; >+ >+ /* Initialize the m64_map to IODA_INVALID_M64 */ >+ for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) >+ for (j = 0; j < MAX_M64_BAR; j++) >+ pdn->m64_map[i][j] = IODA_INVALID_M64; It would be done in pnv_pci_ioda_fixup_iov_resources(). That means it will be done for once if hotplug isn't considered. The code here will be called on every attempt to enable SRIOV capability, which isn't necessary, right? > >- if
Re: [PATCH v3 1/6] powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR
On Thu, Aug 13, 2015 at 10:11:06PM +0800, Wei Yang wrote: >On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If >a SRIOV device's IOV BAR is not 64bit-prefetchable, this is not assigned >from 64bit prefetchable window, which means M64 BAR can't work on it. > >This patch makes this explicit. > >Signed-off-by: Wei Yang Reviewed-by: Gavin Shan >--- > arch/powerpc/platforms/powernv/pci-ioda.c | 25 + > 1 file changed, 9 insertions(+), 16 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c >b/arch/powerpc/platforms/powernv/pci-ioda.c >index 5738d31..9ac324e 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, >int offset) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > /* >* The actual IOV BAR range is determined by the start address >* and the actual size for num_vfs VFs BAR. 
This check is to >@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, >int offset) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); > res2 = *res; > res->start += size * offset; >@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, >u16 num_vfs) > if (!res->flags || !res->parent) > continue; > >- if (!pnv_pci_is_mem_pref_64(res->flags)) >- continue; >- > for (j = 0; j < vf_groups; j++) { > do { > win = > find_next_zero_bit(&phb->ioda.m64_bar_alloc, >@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 >num_vfs) > pdn = pci_get_pdn(pdev); > > if (phb->type == PNV_PHB_IODA2) { >+ if (!pdn->vfs_expanded) { >+ dev_info(&pdev->dev, "don't support this SRIOV device" >+ " with non 64bit-prefetchable IOV BAR\n"); >+ return -ENOSPC; >+ } >+ > /* Calculate available PE for required VFs */ > mutex_lock(&phb->ioda.pe_alloc_mutex); > pdn->offset = bitmap_find_next_zero_area( >@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > if (!res->flags || res->parent) > continue; > if (!pnv_pci_is_mem_pref_64(res->flags)) { >- dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", >+ dev_warn(&pdev->dev, "Don't support SR-IOV with" >+ " non M64 VF BAR%d: %pR. 
\n", >i, res); >- continue; >+ return; > } > > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct >pci_dev *pdev) > res = &pdev->resource[i + PCI_IOV_RESOURCES]; > if (!res->flags || res->parent) > continue; >- if (!pnv_pci_is_mem_pref_64(res->flags)) { >- dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: >%pR\n", >- i, res); >- continue; >- } > > dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); > size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); >-- >1.7.9.5 > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
Hi Christoph, On Fri, Aug 14, 2015 at 12:35 AM, Christoph Hellwig wrote: > On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote: >> I.e. ~90% of this patch set seems to be just mechanically dropping >> BUG_ON()s and converting open coded stuff to use accessor functions >> (which should be macros or get inlined, right?) - and the remaining >> bit is not flushing if we don't have a physical page somewhere. > > Which is was 90%. By lines changed most actually is the diffs for > the cache flushing. I was talking in terms of changes made, not lines changed: by my recollection, about a third of the patches didn't touch flush calls and most of the lines changed looked like refactoring so that making the flush call conditional would be easier. I guess it smelled like you were doing lots of distinct changes in a single patch and I got my numbers wrong. >> Would it make sense to split this patch set into a few bits: one to >> drop all the useless BUG_ON()s, one to convert all the open coded >> stuff to accessor functions, then another to do the actual page-less >> sg stuff? > > Without the ifs the BUG_ON() actually are useful to assert we > never feed the sort of physical addresses we can't otherwise support, > so I don't think that part is doable. My point is that there's a couple of patches that only remove BUG_ON()s, which implies that for that particular driver it doesn't matter if there's a physical page or not, so therefore that code is purely "documentation". Thanks, -- Julian Calaby Email: julian.cal...@gmail.com Profile: http://www.google.com/profiles/julian.calaby/ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 7/7] pmem, dax: have direct_access use __pmem annotation
On Thu, Aug 13, 2015 at 9:51 AM, Ross Zwisler wrote: > Update the annotation for the kaddr pointer returned by direct_access() > so that it is a __pmem pointer. This is consistent with the PMEM driver > and with how this direct_access() pointer is used in the DAX code. > > Signed-off-by: Ross Zwisler > --- > Documentation/filesystems/Locking | 3 ++- > arch/powerpc/sysdev/axonram.c | 7 --- > drivers/block/brd.c | 4 ++-- > drivers/nvdimm/pmem.c | 4 ++-- > drivers/s390/block/dcssblk.c | 10 + > fs/block_dev.c| 2 +- > fs/dax.c | 44 > +-- > include/linux/blkdev.h| 8 +++ > 8 files changed, 45 insertions(+), 37 deletions(-) > > diff --git a/Documentation/filesystems/Locking > b/Documentation/filesystems/Locking > index 6a34a0f..06d4434 100644 > --- a/Documentation/filesystems/Locking > +++ b/Documentation/filesystems/Locking > @@ -397,7 +397,8 @@ prototypes: > int (*release) (struct gendisk *, fmode_t); > int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned > long); > int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, > unsigned long); > - int (*direct_access) (struct block_device *, sector_t, void **, > unsigned long *); > + int (*direct_access) (struct block_device *, sector_t, void __pmem **, > + unsigned long *); So this collides with the __pfn_t work. I think the we have a reasonable chance of getting that in to 4.3, so I'd wait to see if we hit any major roadblocks with that set [1] before merging these. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-August/001803.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 7/7] pmem, dax: have direct_access use __pmem annotation
Update the annotation for the kaddr pointer returned by direct_access() so that it is a __pmem pointer. This is consistent with the PMEM driver and with how this direct_access() pointer is used in the DAX code. Signed-off-by: Ross Zwisler --- Documentation/filesystems/Locking | 3 ++- arch/powerpc/sysdev/axonram.c | 7 --- drivers/block/brd.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- drivers/s390/block/dcssblk.c | 10 + fs/block_dev.c| 2 +- fs/dax.c | 44 +-- include/linux/blkdev.h| 8 +++ 8 files changed, 45 insertions(+), 37 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 6a34a0f..06d4434 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -397,7 +397,8 @@ prototypes: int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + int (*direct_access) (struct block_device *, sector_t, void __pmem **, + unsigned long *); int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index ee90db1..a2be2a6 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; + void *addr = (void *)(bank->ph_addr + offset); - *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; + *kaddr = (void 
__pmem *)addr; + *pfn = virt_to_phys(addr) >> PAGE_SHIFT; return bank->size - offset; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 5750b39..2691bb6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -371,7 +371,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -381,7 +381,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; - *kaddr = page_address(page); + *kaddr = (void __pmem *)page_address(page); *pfn = page_to_pfn(page); /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ade9eb9..68f6a6a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -92,7 +92,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn, long size) { struct pmem_device *pmem = bdev->bd_disk->private_data; size_t offset = sector << 9; @@ -101,7 +101,7 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector, return -ENODEV; /* FIXME convert DAX to comprehend that this mapping has a lifetime */ - *kaddr = (void __force *) pmem->virt_addr + offset; + *kaddr = pmem->virt_addr + offset; *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; return pmem->size - offset; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index da21281..2c5a397 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct 
gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, -void **kaddr, unsigned long *pfn, long size); +void __pmem **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PAR
[PATCH v2 0/7] dax: I/O path enhancements
The goal of this series is to enhance the DAX I/O path so that all operations that store data (I/O writes, zeroing blocks, punching holes, etc.) properly synchronize the stores to media using the PMEM API. This ensures that the data DAX is writing is durable on media before the operation completes. Patches 1-4 are a few random cleanups. Changes from v1: - Removed patches to PMEM for the "read flush" _DSM flag. These are different enough that they deserve their own series, and they have a separate baseline which is currently moving (Dan's memremap() series). - Added clear_pmem() PMEM API to zero DAX memory and flush it in one call. (Dave) - Open coded flushing in arch_wb_cache_pmem() instead of adding a generic clwb_flush_range(). This allowed me to avoid having extra memory barriers and instead rely completely on arch_wmb_pmem() for ordering. (Dave) - Moved the arch implementation of the PMEM API into it's own arch header (Christoph). Ross Zwisler (7): brd: make rd_size static pmem, x86: move x86 PMEM API to new pmem.h header pmem: remove layer when calling arch_has_wmb_pmem() pmem, x86: clean up conditional pmem includes pmem: add wb_cache_pmem() and clear_pmem() dax: update I/O path to do proper PMEM flushing pmem, dax: have direct_access use __pmem annotation Documentation/filesystems/Locking | 3 +- MAINTAINERS | 1 + arch/powerpc/sysdev/axonram.c | 7 ++- arch/x86/include/asm/cacheflush.h | 71 -- arch/x86/include/asm/pmem.h | 123 ++ drivers/block/brd.c | 6 +- drivers/nvdimm/pmem.c | 4 +- drivers/s390/block/dcssblk.c | 10 ++-- fs/block_dev.c| 2 +- fs/dax.c | 73 ++ include/linux/blkdev.h| 8 +-- include/linux/pmem.h | 66 12 files changed, 247 insertions(+), 127 deletions(-) create mode 100644 arch/x86/include/asm/pmem.h -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel
Hi, Here is another instruction trace from a kernel context switch trace. Quite a lot of register and CR save/restore code. Regards, Anton c02943d8 mfcrr12 c02943dc std r20,-96(r1) c02943e0 std r21,-88(r1) c02943e4 rldicl. r9,r4,63,63 c02943e8 std r22,-80(r1) c02943ec mflrr0 c02943f0 std r24,-64(r1) c02943f4 std r25,-56(r1) c02943f8 std r26,-48(r1) c02943fc std r27,-40(r1) c0294400 std r31,-8(r1) c0294404 std r15,-136(r1) c0294408 stw r12,8(r1) c029440c std r16,-128(r1) c0294410 mcrfcr4,cr0 c0294414 std r0,16(r1) c0294418 std r17,-120(r1) c029441c std r18,-112(r1) c0294420 std r19,-104(r1) c0294424 std r23,-72(r1) c0294428 std r28,-32(r1) c029442c std r29,-24(r1) c0294430 std r30,-16(r1) c0294434 stdur1,-272(r1) c0294438 cmpwi cr7,r6,1 c029443c rlwinm r31,r4,4,1,31 c0294440 li r9,0 c029 rotlwi r31,r31,28 c0294448 mr r24,r6 c029444c mr r26,r4 c0294450 mr r25,r3 c0294454 mr r22,r5 c0294458 mr r21,r7 c029445c mr r20,r8 c0294460 std r9,120(r1) c0294464 std r9,112(r1) c0294468 clrldi r27,r31,32 c029446c beq cr7,c0294888 c0294888 ld r29,0(r5) c029488c addir29,r29,-32 c0294890 beq c0294478 c0294478 lwz r9,516(r25) c029447c and r10,r9,r31 c0294480 cmpwi r10,0 c0294484 bne c02945d0 c0294488 cmpdi cr7,r29,0 c029448c beq cr7,c02948c4 c0294490 lwz r9,264(r29) c0294494 and r10,r9,r31 c0294498 cmpwi r10,0 c029449c beq c02948c4 c02948c4 li r3,0 c02948c8 b c02947cc c02947cc addir1,r1,272 c02947d0 ld r0,16(r1) c02947d4 lwz r12,8(r1) c02947d8 ld r15,-136(r1) c02947dc ld r16,-128(r1) c02947e0 mtlrr0 c02947e4 ld r17,-120(r1) c02947e8 ld r18,-112(r1) c02947ec mtocrf 32,r12 c02947f0 mtocrf 16,r12 c02947f4 mtocrf 8,r12 c02947f8 ld r19,-104(r1) c02947fc ld r20,-96(r1) c0294800 ld r21,-88(r1) c0294804 ld r22,-80(r1) c0294808 ld r23,-72(r1) c029480c ld r24,-64(r1) c0294810 ld r25,-56(r1) c0294814 ld r26,-48(r1) c0294818 ld r27,-40(r1) c029481c ld r28,-32(r1) c0294820 ld r29,-24(r1) c0294824 ld r30,-16(r1) c0294828 ld r31,-8(r1) c029482c blr ___ Linuxppc-dev mailing list 
Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 09/10] Define PERF_PMU_TXN_READ interface
On Thu, Aug 13, 2015 at 01:04:28PM -0700, Sukadev Bhattiprolu wrote: > | > | +static int perf_read_group(struct perf_event *event, > | > | + u64 read_format, char __user *buf) > | > | +{ > | > | + struct perf_event *leader = event->group_leader, *child; > | > | + struct perf_event_context *ctx = leader->ctx; > | > | + int ret = leader->read_size; > One other question, We return leader->read_size but allocate/copy_to_user > the sibling's event->read_size. We consistently use read_format from the > 'event' being read, rather than its 'group_leader', so we are ok in terms > of what we copy into values[] for each event in the group. > > But, can the leader's read_format (and hence its read_size) differ from > its sibling's read_size? If so, in the current code, we return the event's > read_size but in the new code, we return the leader's read_size. Hmm, good spotting that. I'm fairly sure I didn't do that on purpose. I think we should use event->read_size there too and have the lot consistent. I don't think we require read_format to be uniform across siblings. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 09/10] Define PERF_PMU_TXN_READ interface
Peter Zijlstra [pet...@infradead.org] wrote: | On Tue, Aug 11, 2015 at 09:14:00PM -0700, Sukadev Bhattiprolu wrote: | > | +static void __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) | > | { | > | + struct perf_event *sub; | > | + int n = 1; /* skip @nr */ | > | > This n = 1 is to skip over the values[0] = 1 + nr_siblings in the | > caller. | > | > Anyway, in __perf_read_group_add() we always start with n = 1, however | > ... | > | | > | + perf_event_read(leader, true); | > | + | > | + /* | > | + * Since we co-schedule groups, {enabled,running} times of siblings | > | + * will be identical to those of the leader, so we only publish one | > | + * set. | > | + */ | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | > | + values[n++] += leader->total_time_enabled + | > | + atomic64_read(leader->child_total_time_enabled); | | Note how this is an in-place addition, Ah, yes, Sorry I missed that. It make sense now and my tests seem to be running fine. | | > | + } | > | | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | > | + values[n++] += leader->total_time_running + | > | + atomic64_read(leader->child_total_time_running); | | and here, | | > | + } | > | | > | + /* | > | + * Write {count,id} tuples for every sibling. | > | + */ | > | + values[n++] += perf_event_count(leader); | | and here, | | | > | if (read_format & PERF_FORMAT_ID) | > | values[n++] = primary_event_id(leader); | | and this will always assign the same value. 
| | > | + list_for_each_entry(sub, &leader->sibling_list, group_entry) { | > | + values[n++] += perf_event_count(sub); | > | + if (read_format & PERF_FORMAT_ID) | > | + values[n++] = primary_event_id(sub); | | Same for these, therefore, | | > | + } | > | +} | > | | > | +static int perf_read_group(struct perf_event *event, | > | +u64 read_format, char __user *buf) | > | +{ | > | + struct perf_event *leader = event->group_leader, *child; | > | + struct perf_event_context *ctx = leader->ctx; | > | + int ret = leader->read_size; One other question, We return leader->read_size but allocate/copy_to_user the sibling's event->read_size. We consistently use read_format from the 'event' being read, rather than its 'group_leader', so we are ok in terms of what we copy into values[] for each event in the group. But, can the leader's read_format (and hence its read_size) differ from its sibling's read_size? If so, in the current code, we return the event's read_size but in the new code, we return the leader's read_size. | > | + u64 *values; | > | | > | + lockdep_assert_held(&ctx->mutex); | > | | > | + values = kzalloc(event->read_size); | > | + if (!values) | > | + return -ENOMEM; | > | | > | + values[0] = 1 + leader->nr_siblings; | > | | > | + /* | > | + * By locking the child_mutex of the leader we effectively | > | + * lock the child list of all siblings.. XXX explain how. | > | + */ | > | + mutex_lock(&leader->child_mutex); | > | | > | + __perf_read_group_add(leader, read_format, values); | > | > ... we don't copy_to_user() here, | > | > | + list_for_each_entry(child, &leader->child_list, child_list) | > | + __perf_read_group_add(child, read_format, values); | > | > so won't we overwrite the values[], if we always start at n = 1 | > in __perf_read_group_add()? | | yes and no, we have to re-iterate the same values for each child as they | all have the same group, but we add the time and count fields, we do not | overwrite. 
The _add() suffix was supposed to be a hint ;-) | | > | + mutex_unlock(&leader->child_mutex); | > | + | > | + if (copy_to_user(buf, values, event->read_size)) | > | + ret = -EFAULT; | > | + | > | + kfree(values); | > | | > | return ret; | > | } | | Where previously we would iterate the group and for each member | iterate/sum all the child values together before copying the value out, | we now, because we need to read groups together, need to first iterate | the child list and sum whole groups. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes
On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote: > It makes no sense to put the instructions for calculating the lock > value (cpu number + 1) and the clearing of eq bit of cr1 in lbarx/stbcx > loop. And when the lock is acquired by the other thread, the current > lock value has no chance to equal with the lock value used by current > cpu. So we can skip the comparing for these two lock values in the > lbz/bne loop. > > Signed-off-by: Kevin Hao > --- > arch/powerpc/mm/tlb_low_64e.S | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S > index 765b419883f2..e4185581c5a7 100644 > --- a/arch/powerpc/mm/tlb_low_64e.S > +++ b/arch/powerpc/mm/tlb_low_64e.S > @@ -308,11 +308,11 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ >* >* MAS6:IND should be already set based on MAS4 >*/ > -1: lbarx r15,0,r11 > lhz r10,PACAPACAINDEX(r13) > - cmpdi r15,0 > - cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ > addir10,r10,1 > + crclr cr1*4+eq/* set cr1.eq = 0 for non-recursive */ > +1: lbarx r15,0,r11 > + cmpdi r15,0 > bne 2f You're optimizing the contended case at the expense of introducing stalls in the uncontended case. Does it really matter if there are more instructions in the loop? This change just means that you'll spin in the loop for more iterations (if it even does that -- I think the cycles per loop iteration might be the same before and after, due to load latency and pairing) while waiting for the other thread to release the lock. Do you have any benchmark results for this patch? -Scott ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 5/5] dma-mapping: consolidate dma_set_mask
On Thu, Aug 13, 2015 at 04:25:05PM +0100, Russell King - ARM Linux wrote: > On Thu, Aug 13, 2015 at 05:04:08PM +0200, Christoph Hellwig wrote: > > diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c > > index 1143c4d..260f52a 100644 > > --- a/arch/arm/common/dmabounce.c > > +++ b/arch/arm/common/dmabounce.c > > @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device > > *dev, > > arm_dma_ops.sync_single_for_device(dev, handle, size, dir); > > } > > > > -static int dmabounce_set_mask(struct device *dev, u64 dma_mask) > > -{ > > - if (dev->archdata.dmabounce) > > - return 0; > > - > > - return arm_dma_ops.set_dma_mask(dev, dma_mask); > > Are you sure about this? A user of dmabounce gets to request any mask > with the original code (even though it was never written back... which > is a separate bug.) After this, it seems that this will get limited > by the dma_supported() check. As this old code is about bouncing any > buffer into DMA-able memory, it doesn't care about the DMA mask. I think you're right. With the default dma_supported implementation it would be fine, but ARM uses a custom one. I'll keep the arm specific dma_set_mask implementation for the next round. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/5] dma-mapping: consolidate dma_{alloc,free}_noncoherent
On Thu, Aug 13, 2015 at 04:20:40PM +0100, Russell King - ARM Linux wrote: > > -/* > > - * Dummy noncoherent implementation. We don't provide a dma_cache_sync > > - * function so drivers using this API are highlighted with build warnings. > > - */ > > I'd like a similar comment to remain after this patch explaining that we > don't support non-coherent allocations and that it'll be highlighted by > the lack of dma_cache_sync, otherwise I'm sure we'll start to get patches > to add the thing. I'll keep a modified version of this comment in the ARM dma-mapping.h in addition to an explanation near the new common dma_alloc_noncoherent definition, thanks! ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
On 08/13/2015 05:40 PM, Christoph Hellwig wrote: > On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote: >> The support I have suggested and submitted for zone-less sections. >> (In my add_persistent_memory() patchset) >> >> Would work perfectly well and transparent for all such multimedia cases. >> (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM >> a few times and it is great easy fun. (I wanted to experiment with cached >> memory over a pcie) > > And everyone agree that it was both buggy and incomplete. > What? No one ever said anything about bugs. Is the first ever I hear of it. I was always in the notion that no one even tried it out. I'm smoking these page-full nvdimms for more than a year. With RDMA to peers and swap out to disks. So is not that bad I would say > Dan has done a respin of the page backed nvdimm work with most of > these comments addressed. > I would love some comments. All I got so far is silence. (And I do not like Dan's patches comments will come next week) > I have to say I hate both pfn-based I/O [1] and page backed nvdimms with > passion, so we're looking into the lesser evil with an open mind. > > [1] not the SGL part posted here, which I think is quite sane. The bio > side is much worse, though. > What can I say. I like the page-backed nvdimms. And the long term for me is 2M pages. I hope we can sit one day soon and you explain to me what's evil about it. I would really really like to understand Thanks though Boaz ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 5/5] dma-mapping: consolidate dma_set_mask
On Thu, Aug 13, 2015 at 05:04:08PM +0200, Christoph Hellwig wrote: > diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c > index 1143c4d..260f52a 100644 > --- a/arch/arm/common/dmabounce.c > +++ b/arch/arm/common/dmabounce.c > @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device *dev, > arm_dma_ops.sync_single_for_device(dev, handle, size, dir); > } > > -static int dmabounce_set_mask(struct device *dev, u64 dma_mask) > -{ > - if (dev->archdata.dmabounce) > - return 0; > - > - return arm_dma_ops.set_dma_mask(dev, dma_mask); Are you sure about this? A user of dmabounce gets to request any mask with the original code (even though it was never written back... which is a separate bug.) After this, it seems that this will get limited by the dma_supported() check. As this old code is about bouncing any buffer into DMA-able memory, it doesn't care about the DMA mask. -- FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up according to speedtest.net. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/5] dma-mapping: consolidate dma_{alloc, free}_noncoherent
On Thu, Aug 13, 2015 at 05:04:05PM +0200, Christoph Hellwig wrote: > diff --git a/arch/arm/include/asm/dma-mapping.h > b/arch/arm/include/asm/dma-mapping.h > index 2ae3424..ab521d5 100644 > --- a/arch/arm/include/asm/dma-mapping.h > +++ b/arch/arm/include/asm/dma-mapping.h > @@ -175,21 +175,6 @@ static inline int dma_mapping_error(struct device *dev, > dma_addr_t dma_addr) > return dma_addr == DMA_ERROR_CODE; > } > > -/* > - * Dummy noncoherent implementation. We don't provide a dma_cache_sync > - * function so drivers using this API are highlighted with build warnings. > - */ I'd like a similar comment to remain after this patch explaining that we don't support non-coherent allocations and that it'll be highlighted by the lack of dma_cache_sync, otherwise I'm sure we'll start to get patches to add the thing. -- FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up according to speedtest.net. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/5] dma-mapping: consolidate dma_set_mask
Almost everyone implements dma_set_mask the same way, although some time that's hidden in ->set_dma_mask methods. Move this implementation to common code, including a callout to override the post-check action, and remove duplicate instaces in methods as well. Unfortunately some architectures overload unrelated semantics like changing the dma_ops into it so we still need to allow for an architecture override for now. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/alpha/kernel/pci-noop.c | 10 -- arch/alpha/kernel/pci_iommu.c | 11 --- arch/arm/common/dmabounce.c | 9 - arch/arm/include/asm/dma-mapping.h| 5 - arch/arm/mm/dma-mapping.c | 16 arch/arm/xen/mm.c | 1 - arch/arm64/include/asm/dma-mapping.h | 9 - arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 1 - arch/hexagon/kernel/dma.c | 11 --- arch/ia64/include/asm/dma-mapping.h | 9 - arch/microblaze/include/asm/dma-mapping.h | 14 -- arch/mips/include/asm/dma-mapping.h | 16 arch/openrisc/include/asm/dma-mapping.h | 9 - arch/powerpc/include/asm/dma-mapping.h| 4 +++- arch/powerpc/platforms/cell/iommu.c | 3 --- arch/s390/include/asm/dma-mapping.h | 2 -- arch/s390/pci/pci_dma.c | 10 -- arch/sh/include/asm/dma-mapping.h | 14 -- arch/sparc/include/asm/dma-mapping.h | 5 +++-- arch/tile/include/asm/dma-mapping.h | 5 +++-- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 2 -- arch/x86/kernel/pci-dma.c | 11 --- drivers/xen/swiotlb-xen.c | 12 include/asm-generic/dma-mapping-common.h | 16 include/xen/swiotlb-xen.h | 2 -- 28 files changed, 25 insertions(+), 202 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 9d763e5..72a8ca7 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_set_mask(struct device *dev, u64 mask) 
-{ - return get_dma_ops(dev)->set_dma_mask(dev, mask); -} - #define dma_cache_sync(dev, va, size, dir) ((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index df24b76..2b1f4a1 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask) return mask < 0x00ffUL ? 0 : 1; } -static int alpha_noop_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - return 0; -} - struct dma_map_ops alpha_noop_ops = { .alloc = alpha_noop_alloc_coherent, .free = alpha_noop_free_coherent, @@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = { .map_sg = alpha_noop_map_sg, .mapping_error = alpha_noop_mapping_error, .dma_supported = alpha_noop_supported, - .set_dma_mask = alpha_noop_set_mask, }; struct dma_map_ops *dma_ops = &alpha_noop_ops; diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index eddee77..8969bf2 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr) return dma_addr == 0; } -static int alpha_pci_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || - !pci_dma_supported(alpha_gendev_to_pci(dev), mask)) - return -EIO; - - *dev->dma_mask = mask; - return 0; -} - struct dma_map_ops alpha_pci_ops = { .alloc = alpha_pci_alloc_coherent, .free = alpha_pci_free_coherent, @@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = { .unmap_sg = alpha_pci_unmap_sg, .mapping_error = alpha_pci_mapping_error, .dma_supported = alpha_pci_supported, - .set_dma_mask = alpha_pci_set_mask, }; struct dma_map_ops *dma_ops = &alpha_pci_ops; diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c index 1143c4d..260f52a 100644 --- a/arch/arm/common/dmabounce.c +++ 
b/arch/arm/common/dmabounce.c @@ -440,14 +440,6 @@ static void dmabounce_sync_for_device(struct device *dev, arm_dma_ops.sync_single_for_device(de
[PATCH 3/5] dma-mapping: cosolidate dma_mapping_error
Currently there are three valid implementations of dma_mapping_error: (1) call ->mapping_error (2) check for a hardcoded error code (3) always return 0 This patch provides a common implementation that calls ->mapping_error if present, then checks for DMA_ERROR_CODE if defined or otherwise returns 0. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/arm/include/asm/dma-mapping.h| 9 - arch/arm64/include/asm/dma-mapping.h | 7 --- arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 11 +-- arch/ia64/include/asm/dma-mapping.h | 7 --- arch/microblaze/include/asm/dma-mapping.h | 11 --- arch/mips/include/asm/dma-mapping.h | 8 arch/openrisc/include/asm/dma-mapping.h | 5 - arch/powerpc/include/asm/dma-mapping.h| 17 ++--- arch/s390/include/asm/dma-mapping.h | 10 -- arch/sh/include/asm/dma-mapping.h | 13 ++--- arch/sparc/include/asm/dma-mapping.h | 6 -- arch/tile/include/asm/dma-mapping.h | 7 --- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 11 --- include/asm-generic/dma-mapping-common.h | 14 ++ 17 files changed, 19 insertions(+), 137 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 0552bf0..80ac3e8 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return get_dma_ops(dev)->mapping_error(dev, dma_addr); -} - static inline int dma_supported(struct device *dev, u64 mask) { return get_dma_ops(dev)->dma_supported(dev, mask); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index ab521d5..2fa33d7 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -166,15 +166,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, 
size_t size) static inline void dma_mark_clean(void *addr, size_t size) { } -/* - * DMA errors are defined by all-bits-set in the DMA address. - */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - debug_dma_mapping_error(dev, dma_addr); - return dma_addr == DMA_ERROR_CODE; -} - extern int dma_supported(struct device *dev, u64 mask); extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 178e60b..f45f444 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -84,13 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) return (phys_addr_t)dev_addr; } -static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dev_addr); - return ops->mapping_error(dev, dev_addr); -} - static inline int dma_supported(struct device *dev, u64 mask) { struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 72465ce..5eef053 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -20,9 +20,4 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return 0; } -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - #endif diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index 58d2d8f..e661192 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -31,6 +31,7 @@ struct device; extern int bad_dma_address; +#define DMA_ERROR_CODE bad_dma_address extern struct dma_map_ops *dma_ops; @@ -57,14 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size - 1 <= *dev->dma_mask; } -static 
inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (dma_ops->mapping_error) - return dma_ops->mapping_error(dev, dma_addr); - - return (dma_addr == bad_dma_address); -} - #endif diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index a925ff0..27b713d 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -27,13 +27,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #include -static inline int dma_mapping_error(st
[PATCH 2/5] dma-mapping: consolidate dma_{alloc,free}_noncoherent
Most architectures do not support non-coherent allocations and either define dma_{alloc,free}_noncoherent to their coherent versions or stub them out. Openrisc uses dma_{alloc,free}_attrs to implement them, and only Mips implements them directly. This patch moves the Openrisc version to common code, and handles the DMA_ATTR_NON_CONSISTENT case in the mips dma_map_ops instance. Note that actual non-coherent allocations require a dma_cache_sync implementation, so if non-coherent allocations didn't work on an architecture before this patch they still won't work after it. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 3 --- arch/arm/include/asm/dma-mapping.h| 15 --- arch/arm64/include/asm/dma-mapping.h | 14 -- arch/h8300/include/asm/dma-mapping.h | 3 --- arch/hexagon/include/asm/dma-mapping.h| 3 --- arch/ia64/include/asm/dma-mapping.h | 3 --- arch/microblaze/include/asm/dma-mapping.h | 3 --- arch/mips/include/asm/dma-mapping.h | 6 -- arch/mips/mm/dma-default.c| 20 +++- arch/openrisc/include/asm/dma-mapping.h | 20 arch/powerpc/include/asm/dma-mapping.h| 3 --- arch/s390/include/asm/dma-mapping.h | 3 --- arch/sh/include/asm/dma-mapping.h | 3 --- arch/sparc/include/asm/dma-mapping.h | 3 --- arch/tile/include/asm/dma-mapping.h | 3 --- arch/unicore32/include/asm/dma-mapping.h | 3 --- arch/x86/include/asm/dma-mapping.h| 3 --- include/asm-generic/dma-mapping-common.h | 18 ++ 18 files changed, 33 insertions(+), 96 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 9fef5bd..0552bf0 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -27,9 +27,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return get_dma_ops(dev)->set_dma_mask(dev, mask); } -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - #define dma_cache_sync(dev, va, size, dir) 
((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 2ae3424..ab521d5 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -175,21 +175,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return dma_addr == DMA_ERROR_CODE; } -/* - * Dummy noncoherent implementation. We don't provide a dma_cache_sync - * function so drivers using this API are highlighted with build warnings. - */ -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) -{ - return NULL; -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) -{ -} - extern int dma_supported(struct device *dev, u64 mask); extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 5e11b3f..178e60b 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -118,19 +118,5 @@ static inline void dma_mark_clean(void *addr, size_t size) { } -/* - * There is no dma_cache_sync() implementation, so just return NULL here. 
- */ -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t flags) -{ - return NULL; -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) -{ -} - #endif /* __KERNEL__ */ #endif /* __ASM_DMA_MAPPING_H */ diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 826aa9b..72465ce 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -20,9 +20,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask) return 0; } -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return 0; diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index c20d3ca..58d2d8f 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -34,9 +34,6 @@ extern int bad_dma_address; extern struct dma_map_ops *dma_ops; -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - static inline s
[PATCH 1/5] dma-mapping: consolidate dma_{alloc, free}_{attrs, coherent}
The coherent DMA allocator works the same over all architectures supporting dma_map operations. This patch consolidates them and converges the minor differences: - the debug_dma helpers are now called from all architectures, including those that were previously missing them - dma_alloc_from_coherent and dma_release_from_coherent are now always called from the generic alloc/free routines instead of the ops dma-mapping-common.h always includes dma-coherent.h to get the definitions for them, or the stubs if the architecture doesn't support this feature - checks for ->alloc / ->free presence are removed. There is only one instance of dma_map_ops without them (mic_dma_ops) and that one is x86 only anyway. Besides that only x86 needs special treatment to replace a default device if none is passed and tweak the gfp_flags. An optional arch hook is provided for that. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 18 -- arch/arm/include/asm/dma-mapping.h| 29 arch/arm/mm/dma-mapping.c | 11 -- arch/arm64/include/asm/dma-mapping.h | 33 -- arch/h8300/include/asm/dma-mapping.h | 26 -- arch/hexagon/include/asm/dma-mapping.h| 33 -- arch/ia64/include/asm/dma-mapping.h | 25 - arch/microblaze/include/asm/dma-mapping.h | 31 - arch/mips/cavium-octeon/dma-octeon.c | 8 - arch/mips/include/asm/dma-mapping.h | 31 - arch/mips/loongson64/common/dma-swiotlb.c | 8 - arch/mips/mm/dma-default.c| 7 arch/mips/netlogic/common/nlm-dma.c | 8 - arch/openrisc/include/asm/dma-mapping.h | 30 arch/powerpc/include/asm/dma-mapping.h| 33 -- arch/s390/include/asm/dma-mapping.h | 31 - arch/sh/include/asm/dma-mapping.h | 37 arch/sparc/include/asm/dma-mapping.h | 26 -- arch/tile/include/asm/dma-mapping.h | 27 -- arch/unicore32/include/asm/dma-mapping.h | 24 - arch/x86/include/asm/dma-mapping.h| 16 ++--- arch/x86/kernel/pci-dma.c | 49 +- drivers/xen/swiotlb-xen.c | 6 include/asm-generic/dma-mapping-common.h | 58 +++ 24 files changed, 70 insertions(+), 535 deletions(-) diff --git 
a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index dfa32f0..9fef5bd 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,24 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -#define dma_alloc_coherent(d,s,h,f)dma_alloc_attrs(d,s,h,f,NULL) - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - struct dma_attrs *attrs) -{ - return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs); -} - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs); -} - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return get_dma_ops(dev)->mapping_error(dev, dma_addr); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index b52101d..2ae3424 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -209,21 +208,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs); -#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL) - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *cpu_addr; - BUG_ON(!ops); - - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); - return cpu_addr; -} - /** * arm_dma_free - free memory allocated by arm_dma_alloc * @dev: valid struct device pointer, or NULL for ISA and 
EISA-like devices @@ -241,19 +225,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, extern void arm_dma_free(struct device *dev, size_t size,
provide more common DMA API functions
Since 2009 we have a nice asm-generic header implementing lots of DMA API functions for architectures using struct dma_map_ops, but unfortunately it's still missing a lot of APIs that all architectures still have to duplicate. This series consolidates the remaining functions, although we still need arch opt outs for two of them as a few architectures have very non-standard implementations. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/5] dma-mapping: consolidate dma_supported
Most architectures just call into ->dma_supported, but some also return 1 if the method is not present, or 0 if no dma ops are present (although that should never happen). Consolidate this broader version into common code. Also fix h8300 which incorrectly always returned 0, which would have been a problem if its dma_set_mask implementation wasn't a similarly buggy noop. As a few architectures have much more elaborate implementations, we still allow for arch overrides. Signed-off-by: Christoph Hellwig --- arch/alpha/include/asm/dma-mapping.h | 5 - arch/arm/include/asm/dma-mapping.h| 5 +++-- arch/arm64/include/asm/dma-mapping.h | 6 -- arch/h8300/include/asm/dma-mapping.h | 5 - arch/hexagon/include/asm/dma-mapping.h| 1 + arch/ia64/include/asm/dma-mapping.h | 6 -- arch/microblaze/include/asm/dma-mapping.h | 11 --- arch/mips/include/asm/dma-mapping.h | 6 -- arch/openrisc/include/asm/dma-mapping.h | 5 +++-- arch/powerpc/include/asm/dma-mapping.h| 11 --- arch/s390/include/asm/dma-mapping.h | 9 - arch/sh/include/asm/dma-mapping.h | 10 -- arch/sparc/include/asm/dma-mapping.h | 1 + arch/tile/include/asm/dma-mapping.h | 6 -- arch/unicore32/include/asm/dma-mapping.h | 10 -- arch/x86/include/asm/dma-mapping.h| 4 +++- include/asm-generic/dma-mapping-common.h | 13 + 17 files changed, 24 insertions(+), 90 deletions(-) diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 80ac3e8..9d763e5 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -12,11 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - return get_dma_ops(dev)->dma_supported(dev, mask); -} - static inline int dma_set_mask(struct device *dev, u64 mask) { return get_dma_ops(dev)->set_dma_mask(dev, mask); diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 2fa33d7..b90d247 100644 --- 
a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -38,6 +38,9 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) dev->archdata.dma_ops = ops; } +#define HAVE_ARCH_DMA_SUPPORTED 1 +extern int dma_supported(struct device *dev, u64 mask); + #include static inline int dma_set_mask(struct device *dev, u64 mask) @@ -166,8 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline void dma_mark_clean(void *addr, size_t size) { } -extern int dma_supported(struct device *dev, u64 mask); - extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); /** diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index f45f444..f519a58 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -84,12 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) return (phys_addr_t)dev_addr; } -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - return ops->dma_supported(dev, mask); -} - static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index 5eef053..48d652e 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -10,11 +10,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 0; -} - static inline int dma_set_mask(struct device *dev, u64 mask) { return 0; diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index e661192..36e8de7 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -43,6 +43,7 @@ static inline struct dma_map_ops *get_dma_ops(struct 
device *dev) return dma_ops; } +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *dev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle); diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 27b713d..7982caa 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -27,12 +27,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #include -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = platform_dma_get_ops(dev); - r
Re: RFC: prepare for struct scatterlist entries without page backing
On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote: > I.e. ~90% of this patch set seems to be just mechanically dropping > BUG_ON()s and converting open coded stuff to use accessor functions > (which should be macros or get inlined, right?) - and the remaining > bit is not flushing if we don't have a physical page somewhere. Which it was, 90%. By lines changed most actually is the diffs for the cache flushing. > Would it make sense to split this patch set into a few bits: one to > drop all the useless BUG_ON()s, one to convert all the open coded > stuff to accessor functions, then another to do the actual page-less > sg stuff? Without the ifs the BUG_ON()s actually are useful to assert we never feed the sort of physical addresses we can't otherwise support, so I don't think that part is doable. A simple series to make more use of sg_phys and add sg_pfn might still be useful, though. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 31/31] dma-mapping-common: skip kmemleak checks for page-less SG entries
On Wed, Aug 12, 2015 at 09:05:15AM -0700, Linus Torvalds wrote: > [ Again, I'm responding to one random patch - this pattern was in > other patches too. ] > > A question: do we actually expect to mix page-less and pageful SG > entries in the same SG list? > > How does that happen? Both for DAX and the video buffer case people could do direct I/O spanning the boundary between such a VMA and a normal one unless we add special code to prevent that. Right now I don't think it's all that useful, but then again it doesn't seem harmful either and adding those checks might add up. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 29/31] parisc: handle page-less SG entries
On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote: > I'm assuming that anybody who wants to use the page-less > scatter-gather lists always does so on memory that isn't actually > virtually mapped at all, or only does so on sane architectures that > are cache coherent at a physical level, but I'd like that assumption > *documented* somewhere. It's temporarily mapped by kmap-like helpers. That code isn't in this series. The most recent version of it is here: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0 note that it's not doing the cache flushing it would have to do yet, but it's also only enabled for x86 at the moment. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: RFC: prepare for struct scatterlist entries without page backing
On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote: > The support I have suggested and submitted for zone-less sections. > (In my add_persistent_memory() patchset) > > Would work perfectly well and transparent for all such multimedia cases. > (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM > a few times and it is great easy fun. (I wanted to experiment with cached > memory over a pcie) And everyone agree that it was both buggy and incomplete. Dan has done a respin of the page backed nvdimm work with most of these comments addressed. I have to say I hate both pfn-based I/O [1] and page backed nvdimms with passion, so we're looking into the lesser evil with an open mind. [1] not the SGL part posted here, which I think is quite sane. The bio side is much worse, though. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 6/6] powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode
When M64 BAR is set to Single PE mode, the PE# assigned to VF could be sparse. This patch restructures the patch to allocate sparse PE# for VFs when M64 BAR is set to Single PE mode. Signed-off-by: Wei Yang --- arch/powerpc/include/asm/pci-bridge.h |2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 59 +++-- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 9d33ada..b026ef8 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -214,7 +214,7 @@ struct pci_dn { #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs;/* number of VFs enabled*/ - int offset; /* PE# for the first VF PE */ + int pe_num_map[MAX_M64_BAR];/* PE# for the first VF PE or array */ boolm64_single_mode;/* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64(-1) int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1e6ac86..7633538 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1232,7 +1232,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) /* Map the M64 here */ if (pdn->m64_single_mode) { - pe_num = pdn->offset + j; + pe_num = pdn->pe_num_map[j]; rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe_num, OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); @@ -1336,7 +1336,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) struct pnv_phb*phb; struct pci_dn *pdn; struct pci_sriov *iov; - u16 num_vfs; + u16 num_vfs, i; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1350,14 +1350,17 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) if (phb->type == PNV_PHB_IODA2) { if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -pdn->offset); + pnv_pci_vf_resource_shift(pdev, -pdn->pe_num_map[0]); /* Release M64 windows */ pnv_pci_vf_release_m64(pdev); /* 
Release PE numbers */ - bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); - pdn->offset = 0; + if (pdn->m64_single_mode) { + for (i = 0; i < num_vfs; i++) + pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + } else + bitmap_clear(phb->ioda.pe_alloc, pdn->pe_num_map[0], num_vfs); } } @@ -1383,7 +1386,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) /* Reserve PE for each VF */ for (vf_index = 0; vf_index < num_vfs; vf_index++) { - pe_num = pdn->offset + vf_index; + if (pdn->m64_single_mode) + pe_num = pdn->pe_num_map[vf_index]; + else + pe_num = pdn->pe_num_map[0] + vf_index; pe = &phb->ioda.pe_array[pe_num]; pe->pe_number = pe_num; @@ -1425,6 +1431,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) struct pnv_phb*phb; struct pci_dn *pdn; intret; + u16i; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1448,19 +1455,30 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) } /* Calculate available PE for required VFs */ - mutex_lock(&phb->ioda.pe_alloc_mutex); - pdn->offset = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe, - 0, num_vfs, 0); - if (pdn->offset >= phb->ioda.total_pe) { + if (pdn->m64_single_mode) { + for (i = 0; i < num_vfs; i++) + pdn->pe_num_map[i] = IODA_INVALID_PE; + for (i = 0; i < num_vfs; i++) { + pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) { + ret = -EBUSY; + goto m64_failed; + } + } + } else { + mutex_lock(&phb->ioda.pe_alloc_mutex); + pdn->pe_num_map[0] = bitmap_find_next_zero_area( +
[PATCH v3 4/6] powerpc/powernv: replace the hard coded boundary with gate
At the moment 64bit-prefetchable window can be maximum 64GB, which is currently got from device tree. This means that in shared mode the maximum supported VF BAR size is 64GB/256=256MB. While this size could exhaust the whole 64bit-prefetchable window. This is a design decision to set a boundary to 64MB of the VF BAR size. Since VF BAR size with 64MB would occupy a quarter of the 64bit-prefetchable window, this is affordable. This patch replaces magic limit of 64MB with (m64_segsize >> 1) and adds comment to explain the reason for it. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4da0f50..3e8c0b4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pnv_phb *phb; struct resource *res; int i; - resource_size_t size; + resource_size_t size, gate; struct pci_dn *pdn; int mul, total_vfs; @@ -2704,6 +2704,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) total_vfs = pci_sriov_get_totalvfs(pdev); mul = phb->ioda.total_pe; + /* +* If bigger than or equal to half of M64 segment size, just round up +* power of two. +* +* Generally, one M64 BAR maps one IOV BAR. To avoid conflict with +* other devices, IOV BAR size is expanded to be (total_pe * +* VF_BAR_size). When VF_BAR_size is half of M64 segment size , the +* expanded size would equal to half of the whole M64 Space size, +* which will exhaust the M64 Space and limit the system flexibility. 
+*/ + gate = phb->ioda.m64_segsize >> 1; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; @@ -2718,10 +2729,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* bigger than 64M */ - if (size > (1 << 26)) { - dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", -i, res); + /* bigger than or equal to gate */ + if (size >= gate) { + dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " + "is bigger than %lld, roundup power2\n", +i, res, gate); mul = roundup_pow_of_two(total_vfs); pdn->m64_single_mode = true; break; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 5/6] powerpc/powernv: boundary the total VF BAR size instead of the individual one
Each VF could have 6 BARs at most. When the total BAR size exceeds the gate, after expanding it will also exhaust the M64 Window. This patch limits the boundary by checking the total VF BAR size instead of the individual BAR. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3e8c0b4..1e6ac86 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2688,7 +2688,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pnv_phb *phb; struct resource *res; int i; - resource_size_t size, gate; + resource_size_t size, gate, total_vf_bar_sz; struct pci_dn *pdn; int mul, total_vfs; @@ -2715,6 +2715,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) * which will exhaust the M64 Space and limit the system flexibility. */ gate = phb->ioda.m64_segsize >> 1; + total_vf_bar_sz = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; @@ -2727,13 +2728,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) return; } - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + total_vf_bar_sz += pci_iov_resource_size(pdev, + i + PCI_IOV_RESOURCES); /* bigger than or equal to gate */ - if (size >= gate) { - dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size " - "is bigger than %lld, roundup power2\n", -i, res, gate); + if (total_vf_bar_sz >= gate) { + dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size " + "is bigger than %lld, roundup power2\n", gate); mul = roundup_pow_of_two(total_vfs); pdn->m64_single_mode = true; break; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 BARs in Single PE mode to cover the number of VFs required to be enabled. By doing so, several VFs would be in one VF Group and leads to interference between VFs in the same group. This patch changes the design by using one M64 BAR in Single PE mode for one VF BAR. This gives absolute isolation for VFs. Signed-off-by: Wei Yang --- arch/powerpc/include/asm/pci-bridge.h |6 +- arch/powerpc/platforms/powernv/pci-ioda.c | 163 +++-- 2 files changed, 62 insertions(+), 107 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 712add5..9d33ada 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -187,6 +187,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) */ struct iommu_table; +#define MAX_M64_BAR 16 struct pci_dn { int flags; #define PCI_DN_FLAG_IOV_VF 0x01 @@ -214,10 +215,9 @@ struct pci_dn { u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs;/* number of VFs enabled*/ int offset; /* PE# for the first VF PE */ -#define M64_PER_IOV 4 - int m64_per_iov; + boolm64_single_mode;/* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64(-1) - int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; + int m64_map[PCI_SRIOV_NUM_BARS][MAX_M64_BAR]; #endif /* CONFIG_PCI_IOV */ #endif struct list_head child_list; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 67b8f72..4da0f50 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1162,15 +1162,14 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev) pdn = pci_get_pdn(pdev); for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) { - if (pdn->m64_wins[i][j] == IODA_INVALID_M64) + for (j = 0; j < MAX_M64_BAR; j++) { + if (pdn->m64_map[i][j] == IODA_INVALID_M64) continue; 
opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); - clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); - pdn->m64_wins[i][j] = IODA_INVALID_M64; + OPAL_M64_WINDOW_TYPE, pdn->m64_map[i][j], 0); + clear_bit(pdn->m64_map[i][j], &phb->ioda.m64_bar_alloc); + pdn->m64_map[i][j] = IODA_INVALID_M64; } - return 0; } @@ -1187,8 +1186,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) inttotal_vfs; resource_size_tsize, start; intpe_num; - intvf_groups; - intvf_per_group; + intm64_bars; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1196,26 +1194,23 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) pdn = pci_get_pdn(pdev); total_vfs = pci_sriov_get_totalvfs(pdev); - /* Initialize the m64_wins to IODA_INVALID_M64 */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) - pdn->m64_wins[i][j] = IODA_INVALID_M64; + if (pdn->m64_single_mode) + m64_bars = num_vfs; + else + m64_bars = 1; + + /* Initialize the m64_map to IODA_INVALID_M64 */ + for (i = 0; i < PCI_SRIOV_NUM_BARS ; i++) + for (j = 0; j < MAX_M64_BAR; j++) + pdn->m64_map[i][j] = IODA_INVALID_M64; - if (pdn->m64_per_iov == M64_PER_IOV) { - vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; - vf_per_group = (num_vfs <= M64_PER_IOV)? 1: - roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; - } else { - vf_groups = 1; - vf_per_group = 1; - } for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || !res->parent) continue; - for (j = 0; j < vf_groups; j++) { + for (j = 0; j < m64_bars; j++) { do { win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1, 0); @@ -1224,12 +1219,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) goto m64_failed;
[PATCH v3 1/6] powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR
On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If a SRIOV device's IOV BAR is not 64bit-prefetchable, this is not assigned from 64bit prefetchable window, which means M64 BAR can't work on it. This patch makes this explicit. Signed-off-by: Wei Yang --- arch/powerpc/platforms/powernv/pci-ioda.c | 25 + 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5738d31..9ac324e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - /* * The actual IOV BAR range is determined by the start address * and the actual size for num_vfs VFs BAR. This check is to @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); res2 = *res; res->start += size * offset; @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) if (!res->flags || !res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) - continue; - for (j = 0; j < vf_groups; j++) { do { win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) pdn = pci_get_pdn(pdev); if (phb->type == PNV_PHB_IODA2) { + if (!pdn->vfs_expanded) { + dev_info(&pdev->dev, "don't support this SRIOV device" + " with non 64bit-prefetchable IOV BAR\n"); + return -ENOSPC; + } + /* Calculate available PE for required VFs */ mutex_lock(&phb->ioda.pe_alloc_mutex); pdn->offset = bitmap_find_next_zero_area( @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) 
if (!res->flags || res->parent) continue; if (!pnv_pci_is_mem_pref_64(res->flags)) { - dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", + dev_warn(&pdev->dev, "Don't support SR-IOV with" + " non M64 VF BAR%d: %pR. \n", i, res); - continue; + return; } size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) { - dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", -i, res); - continue; - } dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 2/6] powerpc/powernv: simplify the calculation of iov resource alignment
The alignment of IOV BAR on PowerNV platform is the total size of the IOV BAR. No matter whether the IOV BAR is extended with number of roundup_pow_of_two(total_vfs) or number of max PE number (256), the total size could be calculated by (vfs_expanded * VF_BAR_size). This patch simplifies the pnv_pci_iov_resource_alignment() by removing the first case. Signed-off-by: Wei Yang Reviewed-by: Gavin Shan --- arch/powerpc/platforms/powernv/pci-ioda.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9ac324e..67b8f72 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno) { struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align, iov_align; - - iov_align = resource_size(&pdev->resource[resno]); - if (iov_align) - return iov_align; + resource_size_t align; + /* +* On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the +* SR-IOV. While from hardware perspective, the range mapped by M64 +* BAR should be size aligned. +* +* This function return the total IOV BAR size if expanded or just the +* individual size if not. +*/ align = pci_iov_resource_size(pdev, resno); if (pdn->vfs_expanded) return pdn->vfs_expanded * align; -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/6] Redesign SR-IOV on PowerNV
In the original design, it tries to group VFs to enable a larger number of VFs in the system, when VF BAR is bigger than 64MB. This design has a flaw in which one error on a VF will interfere with other VFs in the same group. This patch series changes this design by using M64 BAR in Single PE mode to cover only one VF BAR. By doing so, it gives absolute isolation between VFs. v3: * return -ENOSPC when a VF has non-64bit prefetchable BAR * rename offset to pe_num_map and define it statically * change commit log based on comments * define m64_map statically v2: * clean up iov bar alignment calculation * change m64s to m64_bars * add a field to represent M64 Single PE mode will be used * change m64_wins to m64_map * calculate the gate instead of hard coded * dynamically allocate m64_map * dynamically allocate PE# * add a case to calculate iov bar alignment when M64 Single PE is used * when M64 Single PE is used, compare num_vfs with M64 BAR available number in system at first Wei Yang (6): powerpc/powernv: don't enable SRIOV when VF BAR has non 64bit-prefetchable BAR powerpc/powernv: simplify the calculation of iov resource alignment powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR powerpc/powernv: replace the hard coded boundary with gate powerpc/powernv: boundary the total VF BAR size instead of the individual one powerpc/powernv: allocate sparse PE# when using M64 BAR in Single PE mode arch/powerpc/include/asm/pci-bridge.h |8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 284 ++--- 2 files changed, 139 insertions(+), 153 deletions(-) -- 1.7.9.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] powerpc/e6500: hw tablewalk: order the memory access when acquire/release tcd lock
I didn't find anything unusual. But I think we do need to order the load/store of esel_next when acquire/release tcd lock. For acquire, add a data dependency to order the loads of lock and esel_next. For release, even there already have a "isync" here, but it doesn't guarantee any memory access order. So we still need "lwsync" for the two stores for lock and esel_next. Signed-off-by: Kevin Hao --- arch/powerpc/mm/tlb_low_64e.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index e4185581c5a7..964754911987 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -334,6 +334,8 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * with tlbilx before overwriting. */ + andir15,r15,0 /* add a data dependency to order the loards */ + add r11,r11,r15 /* between the lock and esel_next */ lbz r15,TCD_ESEL_NEXT(r11) rlwinm r10,r15,16,0xff orisr10,r10,MAS0_TLBSEL(1)@h @@ -447,6 +449,7 @@ BEGIN_FTR_SECTION beq cr1,1f /* no unlock if lock was recursively grabbed */ li r15,0 isync + lwsync stb r15,0(r11) 1: END_FTR_SECTION_IFSET(CPU_FTR_SMT) -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes
It makes no sense to put the instructions for calculating the lock value (cpu number + 1) and the clearing of the eq bit of cr1 in the lbarx/stbcx loop. And when the lock is acquired by the other thread, the current lock value has no chance to be equal to the lock value used by the current cpu. So we can skip the comparison of these two lock values in the lbz/bne loop. Signed-off-by: Kevin Hao --- arch/powerpc/mm/tlb_low_64e.S | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 765b419883f2..e4185581c5a7 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -308,11 +308,11 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * * MAS6:IND should be already set based on MAS4 */ -1: lbarx r15,0,r11 lhz r10,PACAPACAINDEX(r13) - cmpdi r15,0 - cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ addir10,r10,1 + crclr cr1*4+eq/* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 bne 2f stbcx. r10,0,r11 bne 1b @@ -320,9 +320,9 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ .subsection 1 2: cmpdcr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ beq cr1,3b /* unlock will happen if cr1.eq = 0 */ - lbz r15,0(r11) +10:lbz r15,0(r11) cmpdi r15,0 - bne 2b + bne 10b b 1b .previous -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/3] powerpc/e6500: remove the stale TCD_LOCK macro
Since we moved the "lock" to be the first element of struct tlb_core_data in commit 82d86de25b9c ("powerpc/e6500: Make TLB lock recursive"), this macro is not used by any code. Just delete it. Signed-off-by: Kevin Hao --- arch/powerpc/kernel/asm-offsets.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 98230579d99c..810f433731dc 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -213,7 +213,6 @@ int main(void) offsetof(struct tlb_core_data, esel_max)); DEFINE(TCD_ESEL_FIRST, offsetof(struct tlb_core_data, esel_first)); - DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock)); #endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PPC_STD_MMU_64 -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/slb: Use a local to avoid multiple calls to get_slb_shadow()
For no reason other than it looks ugly. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 0c7115fd314b..515730e499fe 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -62,16 +62,16 @@ static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, enum slb_index index) { + struct slb_shadow *p = get_slb_shadow(); + /* * Clear the ESID first so the entry is not valid while we are * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. */ - get_slb_shadow()->save_area[index].esid = 0; - get_slb_shadow()->save_area[index].vsid = - cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - get_slb_shadow()->save_area[index].esid = - cpu_to_be64(mk_esid_data(ea, ssize, index)); + p->save_area[index].esid = 0; + p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); + p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index)); } static inline void slb_shadow_clear(enum slb_index index) -- 2.1.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/slb: Define an enum for the bolted indexes
From: Anshuman Khandual This patch defines macros for the three bolted SLB indexes we use. Switch the functions that take the indexes as an argument to use the enum. Signed-off-by: Anshuman Khandual Signed-off-by: Michael Ellerman --- v2: Use index rather than slot as that's what the ISA docs call it. Use the enum in the function signatures. arch/powerpc/mm/slb.c | 47 ++- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 8a32a2be3c53..0c7115fd314b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -25,6 +25,11 @@ #include #include +enum slb_index { + LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ + VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000) */ + KSTACK_INDEX= 2, /* Kernel stack map */ +}; extern void slb_allocate_realmode(unsigned long ea); extern void slb_allocate_user(unsigned long ea); @@ -41,9 +46,9 @@ static void slb_allocate(unsigned long ea) (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) static inline unsigned long mk_esid_data(unsigned long ea, int ssize, -unsigned long entry) +enum slb_index index) { - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | entry; + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, @@ -55,39 +60,39 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, -unsigned long entry) +enum slb_index index) { /* * Clear the ESID first so the entry is not valid while we are * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. 
*/ - get_slb_shadow()->save_area[entry].esid = 0; - get_slb_shadow()->save_area[entry].vsid = + get_slb_shadow()->save_area[index].esid = 0; + get_slb_shadow()->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - get_slb_shadow()->save_area[entry].esid = - cpu_to_be64(mk_esid_data(ea, ssize, entry)); + get_slb_shadow()->save_area[index].esid = + cpu_to_be64(mk_esid_data(ea, ssize, index)); } -static inline void slb_shadow_clear(unsigned long entry) +static inline void slb_shadow_clear(enum slb_index index) { - get_slb_shadow()->save_area[entry].esid = 0; + get_slb_shadow()->save_area[index].esid = 0; } static inline void create_shadowed_slbe(unsigned long ea, int ssize, unsigned long flags, - unsigned long entry) + enum slb_index index) { /* * Updating the shadow buffer before writing the SLB ensures * we don't get a stale entry here if we get preempted by PHYP * between these two statements. */ - slb_shadow_update(ea, ssize, flags, entry); + slb_shadow_update(ea, ssize, flags, index); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, entry)) + "r" (mk_esid_data(ea, ssize, index)) : "memory" ); } @@ -103,16 +108,16 @@ static void __slb_flush_and_rebolt(void) lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); + ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); if ((ksp_esid_data & ~0xfffUL) <= PAGE_OFFSET) { ksp_esid_data &= ~SLB_ESID_V; ksp_vsid_data = 0; - slb_shadow_clear(2); + slb_shadow_clear(KSTACK_INDEX); } else { /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); + slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[2].vsid); + be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); } /* We need to do this 
all in asm, so we're sure we don't touch @@ -151,7 +156,7 @@ void slb_vmalloc_update(void) unsigned long vflags; vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, m
Re: [PATCH 02/20] powerpc/8xx: Map linear kernel RAM with 8M pages
On Wed, Aug 12, 2015 at 03:40:56PM +0200, Christophe Leroy wrote: > /* Insert level 1 index */ > rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, > 29 > lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)/* Get the > level 1 entry */ > + mtcrr11 Maybe mtcrf is faster? You only want one field, anyhow. > + bgt-cr7,5f /* CR7.GT = bit 29 = Large page (8M or 512K) */ You can write this as bt- 29,5f which should be easier to read. > /* Insert level 1 index */ > 3: rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, > 29 > lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)/* Get the > level 1 entry */ > + mtcrr11 > + bgt cr7,200f Same here... Probably good to comment it, too. Segher ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev