Re: [PATCH 04/23] scsi: initialize scsi midlayer limits before allocating the queue

2024-06-06 Thread John Garry





diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c
index 817838e2f70e..3cb455a32d92 100644
--- a/drivers/ata/pata_macio.c
+++ b/drivers/ata/pata_macio.c
@@ -915,10 +915,13 @@ static const struct scsi_host_template pata_macio_sht = {
.sg_tablesize   = MAX_DCMDS,
/* We may not need that strict one */
.dma_boundary   = ATA_DMA_BOUNDARY,
-   /* Not sure what the real max is but we know it's less than 64K, let's
-* use 64K minus 256
+   /*
+* The SCSI core requires the segment size to cover at least a page, so
+* for 64K page size kernels this must be at least 64K. However the
+* hardware can't handle 64K, so pata_macio_qc_prep() will split large
+* requests.
 */
-   .max_segment_size   = MAX_DBDMA_SEG,
+   .max_segment_size   = SZ_64K,
.device_configure   = pata_macio_device_configure,
.sdev_groups= ata_common_sdev_groups,
.can_queue  = ATA_DEF_QUEUE,



Feel free to add:
Reviewed-by: John Garry 


Re: [PATCH v5 10/15] perf jevents: Generate metrics and events as separate tables

2023-02-01 Thread John Garry

On 30/01/2023 22:54, Ian Rogers wrote:

This is almost identical to generated perf_pmu__find_events_table(),
except we return a pmu_metrics_table * (instead of a pmu_events_table *)
and also return the metric table member (instead of event table). But
the definitions are:

/* Struct used to make the PMU event table implementation opaque to
callers. */
struct pmu_events_table {
  const struct compact_pmu_event *entries;
  size_t length;
};

/* Struct used to make the PMU metric table implementation opaque to
callers. */
struct pmu_metrics_table {
  const struct compact_pmu_event *entries;
  size_t length;
};

Those structs are defined to be the same thing, so I am failing to see
the point in a) separate structure types b) why so much duplication

As for b), I know that they are generated and the python code may be
simpler this way (is it?), but still...

Agreed. The point is to separate the two tables for the typing at the
API layer, internally the representation is the same. When we decode
one we get a pmu_event and the other we get a pmu_metric, so we don't
want to allow the tables to be switched - hence two types.


I do see the advantage of stronger types but it does seem a bit odd to 
achieve it like this.


Thanks,
John


Re: [PATCH v5 00/15] jevents/pmu-events improvements

2023-02-01 Thread John Garry

On 31/01/2023 00:39, Ian Rogers wrote:

Thanks John, will fix. Is there anything else?

Do you think that pmu-events/__pycache__/metric.cpython-36.pyc should be
deleted with a make clean? I would expect stuff like this to be deleted
(with a clean), but I am not sure if we have a policy on this (pyc files)

Should they be covered by the existing clean target?
https://urldefense.com/v3/__https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/tools/perf/Makefile.perf?h=perf*core*n1102__;LyM!!ACWV5N9M2RV99hQ!IqhXlW8RwYRwSK4Gq_djcf1C7Zjp_q6OmUE8Kb6Cei9CvHFBoJWyMfT3IR8RHRS8iKkd7ZlvE4mvil-4Aos$  


Well it didn't seem to work for me and I was using acme tmp.perf/core 
branch.


Thanks,
John


Re: [PATCH v5 10/15] perf jevents: Generate metrics and events as separate tables

2023-01-30 Thread John Garry

On 26/01/2023 23:36, Ian Rogers wrote:

@@ -660,7 +763,29 @@ const struct pmu_events_table 
*perf_pmu__find_events_table(struct perf_pmu *pmu)
  
  const struct pmu_metrics_table *perf_pmu__find_metrics_table(struct perf_pmu *pmu)

  {
-return (struct pmu_metrics_table *)perf_pmu__find_events_table(pmu);
+const struct pmu_metrics_table *table = NULL;
+char *cpuid = perf_pmu__getcpuid(pmu);
+int i;
+
+/* on some platforms which uses cpus map, cpuid can be NULL for
+ * PMUs other than CORE PMUs.
+ */
+if (!cpuid)
+return NULL;
+
+i = 0;
+for (;;) {
+const struct pmu_events_map *map = &pmu_events_map[i++];
+if (!map->arch)
+break;
+
+if (!strcmp_cpuid_str(map->cpuid, cpuid)) {
+table = &map->metric_table;
+break;
+}
+}
+free(cpuid);
+return table;
  }


This is almost identical to generated perf_pmu__find_events_table(), 
except we return a pmu_metrics_table * (instead of a pmu_events_table *) 
and also return the metric table member (instead of event table). But 
the definitions are:


/* Struct used to make the PMU event table implementation opaque to 
callers. */

struct pmu_events_table {
const struct compact_pmu_event *entries;
size_t length;
};

/* Struct used to make the PMU metric table implementation opaque to 
callers. */

struct pmu_metrics_table {
const struct compact_pmu_event *entries;
size_t length;
};

Those structs are defined to be the same thing, so I am failing to see 
the point in a) separate structure types b) why so much duplication


As for b), I know that they are generated and the python code may be 
simpler this way (is it?), but still...


Thanks,
John


Re: [PATCH v5 00/15] jevents/pmu-events improvements

2023-01-30 Thread John Garry

On 27/01/2023 13:48, Ian Rogers wrote:
On Fri, Jan 27, 2023, 5:20 AM John Garry <mailto:john.g.ga...@oracle.com>> wrote:


On 26/01/2023 23:36, Ian Rogers wrote:

Hi Ian,

At a glance, none of this series has your Signed-off-by tag..

Thanks,
John



Thanks John, will fix. Is there anything else?


Do you think that pmu-events/__pycache__/metric.cpython-36.pyc should be 
deleted with a make clean? I would expect stuff like this to be deleted 
(with a clean), but I am not sure if we have a policy on this (pyc files)


Thanks,
John


Re: [PATCH v5 00/15] jevents/pmu-events improvements

2023-01-27 Thread John Garry

On 27/01/2023 13:48, Ian Rogers wrote:
On Fri, Jan 27, 2023, 5:20 AM John Garry <mailto:john.g.ga...@oracle.com>> wrote:


On 26/01/2023 23:36, Ian Rogers wrote:

Hi Ian,

At a glance, none of this series has your Signed-off-by tag..

Thanks,
John



Thanks John, will fix. Is there anything else?


Not yet, but I am just trying to get through it - there's a lot here...

Thanks,
John


Re: [PATCH v5 00/15] jevents/pmu-events improvements

2023-01-27 Thread John Garry

On 26/01/2023 23:36, Ian Rogers wrote:

Hi Ian,

At a glance, none of this series has your Signed-off-by tag..

Thanks,
John


Add an optimization to jevents using the metric code, rewrite metrics
in terms of each other in order to minimize size and improve
readability. For example, on Power8
other_stall_cpi is rewritten from:
"PM_CMPLU_STALL / PM_RUN_INST_CMPL - PM_CMPLU_STALL_BRU_CRU / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_FXU / PM_RUN_INST_CMPL - PM_CMPLU_STALL_VSU / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_LSU / PM_RUN_INST_CMPL - PM_CMPLU_STALL_NTCG_FLUSH / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_NO_NTF / PM_RUN_INST_CMPL"
to:
"stall_cpi - bru_cru_stall_cpi - fxu_stall_cpi - vsu_stall_cpi - lsu_stall_cpi - 
ntcg_flush_cpi - no_ntf_stall_cpi"
Which more closely matches the definition on Power9.

A limitation of the substitutions are that they depend on strict
equality and the shape of the tree. This means that for "a + b + c"
then a substitution of "a + b" will succeed while "b + c" will fail
(the LHS for "+ c" is "a + b" not just "b").

Separate out the events and metrics in the pmu-events tables saving
14.8% in the table size while making it that metrics no longer need to
iterate over all events and vice versa. These changes remove evsel's
direct metric support as the pmu_event no longer has a metric to
populate it. This is a minor issue as the code wasn't working
properly, metrics for this are rare and can still be properly run
using '-M'.

Add an ability to just build certain models into the jevents generated
pmu-metrics.c code. This functionality is appropriate for operating
systems like ChromeOS, that aim to minimize binary size and know all
the target CPU models.




Re: [PATCH v5 12/15] perf pmu-events: Fix testing with JEVENTS_ARCH=all

2023-01-27 Thread John Garry

On 26/01/2023 23:36, Ian Rogers wrote:

The #slots literal will return NAN when not on ARM64 which causes a
perf test failure when not on an ARM64 for a JEVENTS_ARCH=all build:
..
  10.4: Parsing of PMU event table metrics with fake PMUs : FAILED!
..
Add an is_test boolean so that the failure can be avoided when running
as a test.

Fixes: acef233b7ca7 ("perf pmu: Add #slots literal support for arm64")
---


Again, no SoB here.

For me, though:

Reviewed-by: John Garry 


Re: [PATCH v5 11/15] perf jevents: Add model list option

2023-01-27 Thread John Garry

On 26/01/2023 23:36, Ian Rogers wrote:

This allows the set of generated jevents events and metrics to be limited
to a subset of the model names. Appropriate if trying to minimize the
binary size where only a set of models are possible.


No SoB or RB tags at all.

Thanks,
John


---
  tools/perf/pmu-events/Build  |  3 ++-
  tools/perf/pmu-events/jevents.py | 14 ++
  2 files changed, 16 insertions(+), 1 deletion(-)




Re: [PATCH v4 02/12] perf jevents metric: Add ability to rewrite metrics in terms of others

2023-01-26 Thread John Garry

On 26/01/2023 01:18, Ian Rogers wrote:

Add RewriteMetricsInTermsOfOthers that iterates over pairs of names
and expressions trying to replace an expression, within the current
expression, with its name.

Signed-off-by: Ian Rogers 


hmmm ... did you test this for many python versions?

Maybe this patch causes this error:

Traceback (most recent call last):
 File "pmu-events/jevents.py", line 7, in 
   import metric
 File "/home/john/acme/tools/perf/pmu-events/metric.py", line 549, in 


   def RewriteMetricsInTermsOfOthers(metrics: list[Tuple[str, Expression]]
TypeError: 'type' object is not subscriptable
make[3]: *** [pmu-events/Build:26: pmu-events/pmu-events.c] Error 1
make[2]: *** [Makefile.perf:676: pmu-events/pmu-events-in.o] Error 2
make[2]: *** Waiting for unfinished jobs

I have python 3.6.15

Thanks,
John




Re: [PATCH v4 11/12] perf jevents: Add model list option

2023-01-26 Thread John Garry

On 26/01/2023 14:20, Arnaldo Carvalho de Melo wrote:

Em Thu, Jan 26, 2023 at 01:44:39PM +, John Garry escreveu:

On 26/01/2023 01:18, Ian Rogers wrote:

This allows the set of generated jevents events and metrics to be limited
to a subset of the model names. Appropriate if trying to minimize the
binary size where only a set of models are possible.

Signed-off-by: Ian Rogers

Thanks for this:

Reviewed-by: John Garry

Thanks for reviewing the series John, I see there are a few patches for
which you didn't provide your Reviewed-by, 


Hi Arnaldo,


are you planning to review
those as well?



I will do when I get a chance over the next day or so. I am just 
checking the easier ones at the moment :)


Thanks,
John



Re: [PATCH v4 11/12] perf jevents: Add model list option

2023-01-26 Thread John Garry

On 26/01/2023 01:18, Ian Rogers wrote:

This allows the set of generated jevents events and metrics to be limited
to a subset of the model names. Appropriate if trying to minimize the
binary size where only a set of models are possible.

Signed-off-by: Ian Rogers 


Thanks for this:

Reviewed-by: John Garry 


---
  tools/perf/pmu-events/Build  |  3 ++-
  tools/perf/pmu-events/jevents.py | 14 ++
  2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index 15b9e8fdbffa..a14de24ecb69 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -10,6 +10,7 @@ JEVENTS_PY=  pmu-events/jevents.py
  ifeq ($(JEVENTS_ARCH),)
  JEVENTS_ARCH=$(SRCARCH)
  endif
+JEVENTS_MODEL ?= all
  
  #

  # Locate/process JSON files in pmu-events/arch/
@@ -23,5 +24,5 @@ $(OUTPUT)pmu-events/pmu-events.c: 
pmu-events/empty-pmu-events.c
  else
  $(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JSON_TEST) $(JEVENTS_PY) 
pmu-events/metric.py
$(call rule_mkdir)
-   $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) 
pmu-events/arch $@
+   $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) 
$(JEVENTS_MODEL) pmu-events/arch $@
  endif
diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py
index 627ee817f57f..2bcd07ce609f 100755
--- a/tools/perf/pmu-events/jevents.py
+++ b/tools/perf/pmu-events/jevents.py
@@ -599,6 +599,8 @@ const struct pmu_events_map pmu_events_map[] = {
  else:
metric_tblname = 'NULL'
metric_size = '0'
+if event_size == '0' and metric_size == '0':
+  continue
  cpuid = row[0].replace('\\', '')
  _args.output_file.write(f"""{{
  \t.arch = "{arch}",
@@ -888,12 +890,24 @@ def main() -> None:
action: Callable[[Sequence[str], os.DirEntry], None]) -> None:
  """Replicate the directory/file walking behavior of C's file tree walk."""
  for item in os.scandir(path):
+  if _args.model != 'all' and item.is_dir():
+# Check if the model matches one in _args.model.
+if len(parents) == _args.model.split(',')[0].count('/'):
+  # We're testing the correct directory.
+  item_path = '/'.join(parents) + ('/' if len(parents) > 0 else '') + 
item.name
+  if 'test' not in item_path and item_path not in 
_args.model.split(','):
+continue
action(parents, item)
if item.is_dir():
  ftw(item.path, parents + [item.name], action)
  
ap = argparse.ArgumentParser()

ap.add_argument('arch', help='Architecture name like x86')
+  ap.add_argument('model', help='''Select a model such as skylake to
+reduce the code size.  Normally set to "all". For architectures like
+ARM64 with an implementor/model, the model must include the implementor


mega-nit: /s/ARM64/arm64/

it just makes grepping easier (without -i, of course)


+such as "arm/cortex-a34".''',
+  default='all')
ap.add_argument(
'starting_dir',
type=dir_path,




Re: [PATCH v3 11/11] perf jevents: Add model list option

2023-01-25 Thread John Garry

On 24/01/2023 06:33, Ian Rogers wrote:

This allows the set of generated jevents events and metrics to be limited
to a subset of the model names. Appropriate if trying to minimize the
binary size where only a set of models are possible. On ARM64 the
--model selects the implementor rather than model.

Signed-off-by: Ian Rogers 


would it be really difficult/painful to support model names as specified 
in the mapfile.csv, like "skylake" (for x86) or "hisilicon/hip08" (for 
arm64)?


Thanks,
John




Re: [PATCH v3 06/11] perf pmu-events: Remove now unused event and metric variables

2023-01-25 Thread John Garry

On 24/01/2023 06:33, Ian Rogers wrote:

Previous changes separated the uses of pmu_event and pmu_metric,
however, both structures contained all the variables of event and
metric. This change removes the event variables from metric and the
metric variables from event.

Note, this change removes the setting of evsel's metric_name/expr as
these fields are no longer part of struct pmu_event. The metric
remains but is no longer implicitly requested when the event is. This
impacts a few Intel uncore events, however, as the ScaleUnit is shared
by the event and the metric this utility is questionable. Also the
MetricNames look broken (contain spaces) in some cases and when trying
to use the functionality with '-e' the metrics fail but regular
metrics with '-M' work. For example, on SkylakeX '-M' works:



Reviewed-by: John Garry 


Re: [PATCH v3 05/11] perf pmu-events: Separate the metrics from events for no jevents

2023-01-24 Thread John Garry

On 24/01/2023 06:33, Ian Rogers wrote:

Separate the event and metric table when building without jevents. Add
find_core_metrics_table and perf_pmu__find_metrics_table while
renaming existing utilities to be event specific, so that users can
find the right table for their need.

Signed-off-by: Ian Rogers


Reviewed-by: John Garry 


Re: [PATCH v3 04/11] perf pmu-events: Add separate metric from pmu_event

2023-01-24 Thread John Garry

On 24/01/2023 06:33, Ian Rogers wrote:

Create a new pmu_metric for the metric related variables from
pmu_event but that is initially just a clone of pmu_event. Add
iterators for pmu_metric and use in places that metrics are desired
rather than events. Make the event iterator skip metric only events,
and the metric iterator skip event only events.

Signed-off-by: Ian Rogers


Reviewed-by: John Garry 


Re: [PATCH v2 4/9] perf pmu-events: Separate metric out of pmu_event

2023-01-23 Thread John Garry

On 21/12/2022 22:34, Ian Rogers wrote:

Previously both events and metrics were encoded in struct
pmu_event. Create a new pmu_metric that has the metric related
variables and remove these from pmu_event. Add iterators for
pmu_metric and use in places that metrics are desired rather than
events.

Note, this change removes the setting of evsel's metric_name/expr as
these fields are no longer part of struct pmu_event. The metric
remains but is no longer implicitly requested when the event is. This
impacts a few Intel uncore events, however, as the ScaleUnit is shared
by the event and the metric this utility is questionable. Also the
MetricNames look broken (contain spaces) in some cases and when trying
to use the functionality with '-e' the metrics fail but regular
metrics with '-M' work. For example, on SkylakeX '-M' works:



I like this change. It's quite large for a single patch. Just some 
sparse comments below.


BTW, it there a better name for metric struct variable than "pm"? To me 
and many other people, pm is power management.



```
$ perf stat -M LLC_MISSES.PCIE_WRITE -a sleep 1

  Performance counter stats for 'system wide':

  0  UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 #  57896.0 
Bytes  LLC_MISSES.PCIE_WRITE  (49.84%)
  7,174  UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1
(49.85%)
  0  UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3
(50.16%)
 63  UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0
(50.15%)

1.004576381 seconds time elapsed
```

whilst the event '-e' version is broken even with --group/-g (fwiw, we should 
also remove -g [1]):

```
$ perf stat -g -e LLC_MISSES.PCIE_WRITE -g -a sleep 1
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE
Add UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 event to groups to get metric 
expression for LLC_MISSES.PCIE_WRITE

  Performance counter stats for 'system wide':

 27,316 Bytes LLC_MISSES.PCIE_WRITE

1.004505469 seconds time elapsed
```

The code also carries warnings where the user is supposed to select
events for metrics [2] but given the lack of use of such a feature,
let's clean the code and just remove.

With NO_JEVENTS=1 the empty-pmu-events.c is used and the separation to
use metrics causes "Parse and process metrics" and "Event expansion
for cgroups" to fail as they fail 

Re: [PATCH v2 7/9] perf pmu-events: Introduce pmu_metrics_table

2023-01-23 Thread John Garry

On 21/12/2022 22:34, Ian Rogers wrote:

Add a metrics table that is just a cast from pmu_events_table. This
changes the APIs so that event and metric usage of the underlying
table is different. Later changes will separate the tables.

This introduction fixes a NO_JEVENTS=1 regression on:
  68: Parse and process metrics   : Ok
  70: Event expansion for cgroups : Ok
caused by the necessary test metrics not being found.



I have just checked some of this code so far...


Signed-off-by: Ian Rogers 
---
  tools/perf/arch/arm64/util/pmu.c | 23 ++-
  tools/perf/pmu-events/empty-pmu-events.c | 52 
  tools/perf/pmu-events/jevents.py | 24 ---
  tools/perf/pmu-events/pmu-events.h   | 10 +++--
  tools/perf/tests/expand-cgroup.c |  4 +-
  tools/perf/tests/parse-metric.c  |  4 +-
  tools/perf/tests/pmu-events.c|  5 ++-
  tools/perf/util/metricgroup.c| 50 +++
  tools/perf/util/metricgroup.h|  2 +-
  tools/perf/util/pmu.c|  9 +++-
  tools/perf/util/pmu.h|  1 +
  11 files changed, 133 insertions(+), 51 deletions(-)

diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
index 477e513972a4..f8ae479a06db 100644
--- a/tools/perf/arch/arm64/util/pmu.c
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -19,7 +19,28 @@ const struct pmu_events_table *pmu_events_table__find(void)
if (pmu->cpus->nr != cpu__max_cpu().cpu)
return NULL;
  
-		return perf_pmu__find_table(pmu);

+   return perf_pmu__find_events_table(pmu);
+   }
+
+   return NULL;
+}
+
+const struct pmu_metrics_table *pmu_metrics_table__find(void)
+{
+   struct perf_pmu *pmu = NULL;
+
+   while ((pmu = perf_pmu__scan(pmu))) {
+   if (!is_pmu_core(pmu->name))
+   continue;
+
+   /*
+* The cpumap should cover all CPUs. Otherwise, some CPUs may
+* not support some events or have different event IDs.
+*/
+   if (pmu->cpus->nr != cpu__max_cpu().cpu)
+   return NULL;
+
+   return perf_pmu__find_metrics_table(pmu);


I think that this code will be conflicting with the recent arm64 metric 
support. And now it seems even more scope for factoring out code.



}
  
  	return NULL;

diff --git a/tools/perf/pmu-events/empty-pmu-events.c 
b/tools/perf/pmu-events/empty-pmu-events.c
index 5572a4d1eddb..d50f60a571dd 100644
--- a/tools/perf/pmu-events/empty-pmu-events.c
+++ b/tools/perf/pmu-events/empty-pmu-events.c
@@ -278,14 +278,12 @@ int pmu_events_table_for_each_event(const struct 
pmu_events_table *table, pmu_ev
return 0;
  }
  
-int pmu_events_table_for_each_metric(const struct pmu_events_table *etable, pmu_metric_iter_fn fn,

-void *data)
+int pmu_metrics_table_for_each_metric(const struct pmu_metrics_table *table, 
pmu_metric_iter_fn fn,
+ void *data)
  {
-   struct pmu_metrics_table *table = (struct pmu_metrics_table *)etable;
-
	for (const struct pmu_metric *pm = &table->entries[0]


nit on coding style: do we normally declare local variables like this? 
It condenses the code but makes a bit less readable, IMHO



; pm->metric_group || pm->metric_name;
 pm++) {
-   int ret = fn(pm, etable, data);
+   int ret = fn(pm, table, data);
  
  		if (ret)

return ret;
@@ -293,7 +291,7 @@ int pmu_events_table_for_each_metric(const struct 
pmu_events_table *etable, pmu_
return 0;
  }
  
-const struct pmu_events_table *perf_pmu__find_table(struct perf_pmu *pmu)

+const struct pmu_events_table *perf_pmu__find_events_table(struct perf_pmu 
*pmu)
  {
const struct pmu_events_table *table = NULL;
char *cpuid = perf_pmu__getcpuid(pmu);
@@ -321,6 +319,34 @@ const struct pmu_events_table *perf_pmu__find_table(struct 
perf_pmu *pmu)
return table;
  }
  
+const struct pmu_metrics_table *perf_pmu__find_metrics_table(struct perf_pmu *pmu)

+{
+   const struct pmu_metrics_table *table = NULL;
+   char *cpuid = perf_pmu__getcpuid(pmu);
+   int i;
+
+   /* on some platforms which uses cpus map, cpuid can be NULL for
+* PMUs other than CORE PMUs.
+*/
+   if (!cpuid)
+   return NULL;
+
+   i = 0;
+   for (;;) {
+   const struct pmu_events_map *map = &pmu_events_map[i++];


To me, this is all strange code. Again this is a comment on the current 
code: Consider pmu_for_each_sys_event() as an example, we have a while 
loop for each member of pmu_sys_event_tables[]. But pmu_sys_event_tables 
is hardcoded for a single member, so why loop? It seems the same for all 
these "for each" helper in the "empty" 

Re: [PATCH v2 8/9] perf jevents: Generate metrics and events as separate tables

2023-01-23 Thread John Garry

On 21/12/2022 22:34, Ian Rogers wrote:

Turn a perf json event into an event, metric or both. This reduces the
number of events needed to scan to find an event or metric. As events
no longer need the relatively seldom used metric fields, 4 bytes is
saved per event. This reduces the big C string's size by 335kb (14.8%)
on x86.

Signed-off-by: Ian Rogers


It would have been good to show an example of how the output changes. I 
could not apply the series (see cover), and knowing what to expect makes 
reviewing the code easier...


Thanks,
John


Re: [PATCH v2 0/9] jevents/pmu-events improvements

2023-01-23 Thread John Garry

On 21/12/2022 22:34, Ian Rogers wrote:

Add an optimization to jevents using the metric code, rewrite metrics
in terms of each other in order to minimize size and improve
readability. For example, on Power8
other_stall_cpi is rewritten from:
"PM_CMPLU_STALL / PM_RUN_INST_CMPL - PM_CMPLU_STALL_BRU_CRU / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_FXU / PM_RUN_INST_CMPL - PM_CMPLU_STALL_VSU / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_LSU / PM_RUN_INST_CMPL - PM_CMPLU_STALL_NTCG_FLUSH / PM_RUN_INST_CMPL - 
PM_CMPLU_STALL_NO_NTF / PM_RUN_INST_CMPL"
to:
"stall_cpi - bru_cru_stall_cpi - fxu_stall_cpi - vsu_stall_cpi - lsu_stall_cpi - 
ntcg_flush_cpi - no_ntf_stall_cpi"
Which more closely matches the definition on Power9.

A limitation of the substitutions are that they depend on strict
equality and the shape of the tree. This means that for "a + b + c"
then a substitution of "a + b" will succeed while "b + c" will fail
(the LHS for "+ c" is "a + b" not just "b").

Separate out the events and metrics in the pmu-events tables saving
14.8% in the table size while making it that metrics no longer need to
iterate over all events and vice versa. These changes remove evsel's
direct metric support as the pmu_event no longer has a metric to
populate it. This is a minor issue as the code wasn't working
properly, metrics for this are rare and can still be properly run
using '-M'.

Add an ability to just build certain models into the jevents generated
pmu-metrics.c code. This functionality is appropriate for operating
systems like ChromeOS, that aim to minimize binary size and know all
the target CPU models.


From a glance, this does not look like it would work for arm64. As I 
see in the code, we check the model in the arch folder for the test to 
see if built. For arm64, as it uses arch/implementator/model folder org, 
and not just arch/model (like x86)


So on the assumption that it does not work for arm64 (or just any arch 
which uses arch/implementator/model folder org), it would be nice to 
have that feature also. Or maybe also support not just specifying model 
but also implementator.




v2. Rebase. Modify the code that skips rewriting a metric with the
 same name with itself, to make the name check case insensitive.




Unfortunately you might need another rebase as this does not apply to 
acme perf/core (if that is what you want), now for me at:


5670ebf54bd2 (HEAD, origin/tmp.perf/core, origin/perf/core, perf/core)
perf cs-etm: Ensure that Coresight timestamps don't go backwards


Ian Rogers (9):
   perf jevents metric: Correct Function equality
   perf jevents metric: Add ability to rewrite metrics in terms of others
   perf jevents: Rewrite metrics in the same file with each other
   perf pmu-events: Separate metric out of pmu_event
   perf stat: Remove evsel metric_name/expr
   perf jevents: Combine table prefix and suffix writing
   perf pmu-events: Introduce pmu_metrics_table
   perf jevents: Generate metrics and events as separate tables
   perf jevents: Add model list option


Thanks,
John


Re: [PATCH v2 5/9] perf stat: Remove evsel metric_name/expr

2023-01-23 Thread John Garry

On 21/12/2022 22:34, Ian Rogers wrote:

Metrics are their own unit and these variables held broken metrics
previously and now just hold the value NULL. Remove code that used
these variables.

Signed-off-by: Ian Rogers


Reviewed-by: John Garry 


Re: [PATCH] scsi: ibmvfc: Stop using scsi_cmnd.tag

2021-08-18 Thread John Garry

Hi Martin,




Use scsi_cmd_to_rq(scsi_cmnd)->tag in preference to scsi_cmnd.tag.


Applied to 5.15/scsi-staging and rebased for bisectability.



Thanks, and sorry for the hassle. But I would still like the maintainers 
to have a look, as I was curious about current usage of scsi_cmnd.tag in 
that driver.



Just to be picky it looks like there's another scsi_cmmd tag lurking in
qla1280.c but it's sitting behind an #ifdef DEBUG_QLA1280.



That driver does not even compile with DEBUG_QLA1280 set beforehand. 
I'll fix that up and send as separate patches in case you want to 
shuffle the tag patch in earlier, which is prob not worth the effort.


I've done a good few more x86 randconfigs and tried to audit the code 
for more references, so hopefully that's the last.


Thanks


[PATCH] scsi: ibmvfc: Stop using scsi_cmnd.tag

2021-08-17 Thread John Garry
Use scsi_cmd_to_rq(scsi_cmnd)->tag in preference to scsi_cmnd.tag.

Signed-off-by: John Garry 
---
This patch was missed in a series to remove scsi_cmnd.tag, which caused
a build error on Martin's SCSI staging tree:
https://lore.kernel.org/linux-scsi/yq14kbppa42@ca-mkp.ca.oracle.com/T/#mb47909f38f35837686734369600051b278d124af

I note that scsi_cmnd.tag is/was an unsigned char, and I could not find
anywhere in the driver which limits scsi_host.can_queue to 255, so using
scsi_cmnd.tag looks odd to me.

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 7fa5e64e38c3..ba7150cb226a 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1956,7 +1956,7 @@ static int ibmvfc_queuecommand(struct Scsi_Host *shost, 
struct scsi_cmnd *cmnd)
memcpy(iu->cdb, cmnd->cmnd, cmnd->cmd_len);
 
if (cmnd->flags & SCMD_TAGGED) {
-   vfc_cmd->task_tag = cpu_to_be64(cmnd->tag);
+   vfc_cmd->task_tag = cpu_to_be64(scsi_cmd_to_rq(cmnd)->tag);
iu->pri_task_attr = IBMVFC_SIMPLE_TASK;
}
 
-- 
2.17.1



Re: [PATCH v4 01/21] ibmvfc: add vhost fields and defaults for MQ enablement

2021-01-12 Thread John Garry

On 11/01/2021 23:12, Tyrel Datwyler wrote:

Introduce several new vhost fields for managing MQ state of the adapter
as well as initial defaults for MQ enablement.

Signed-off-by: Tyrel Datwyler 
---
  drivers/scsi/ibmvscsi/ibmvfc.c | 8 
  drivers/scsi/ibmvscsi/ibmvfc.h | 9 +
  2 files changed, 17 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index ba95438a8912..9200fe49c57e 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3302,6 +3302,7 @@ static struct scsi_host_template driver_template = {
.max_sectors = IBMVFC_MAX_SECTORS,
.shost_attrs = ibmvfc_attrs,
.track_queue_depth = 1,
+   .host_tagset = 1,


Good to see another user :)

I didn't check the whole series very thoroughly, but I guess that you 
only need to set this when shost->nr_hw_queues > 1. Having said that, it 
should be fine when shost->nr_hw_queues = 0 or 1.


Thanks,
John


  };
  
  /**

@@ -5290,6 +5291,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
shost->max_sectors = IBMVFC_MAX_SECTORS;
shost->max_cmd_len = IBMVFC_MAX_CDB_LEN;
shost->unique_id = shost->host_no;
+   shost->nr_hw_queues = IBMVFC_MQ ? IBMVFC_SCSI_HW_QUEUES : 1;
  
  	vhost = shost_priv(shost);

INIT_LIST_HEAD(>targets);
@@ -5300,6 +5302,12 @@ static int ibmvfc_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
vhost->partition_number = -1;
vhost->log_level = log_level;
vhost->task_set = 1;
+
+   vhost->mq_enabled = IBMVFC_MQ;
+   vhost->client_scsi_channels = IBMVFC_SCSI_CHANNELS;
+   vhost->using_channels = 0;
+   vhost->do_enquiry = 1;
+
strcpy(vhost->partition_name, "UNKNOWN");
init_waitqueue_head(>work_wait_q);
init_waitqueue_head(>init_wait_q);
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 632e977449c5..dd6d89292867 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -41,6 +41,11 @@
  #define IBMVFC_DEFAULT_LOG_LEVEL  2
  #define IBMVFC_MAX_CDB_LEN16
  #define IBMVFC_CLS3_ERROR 0
+#define IBMVFC_MQ  0
+#define IBMVFC_SCSI_CHANNELS   0
+#define IBMVFC_SCSI_HW_QUEUES  1
+#define IBMVFC_MIG_NO_SUB_TO_CRQ   0
+#define IBMVFC_MIG_NO_N_TO_M   0
  
  /*

   * Ensure we have resources for ERP and initialization:
@@ -840,6 +845,10 @@ struct ibmvfc_host {
int delay_init;
int scan_complete;
int logged_in;
+   int mq_enabled;
+   int using_channels;
+   int do_enquiry;
+   int client_scsi_channels;
int aborting_passthru;
int events_to_log;
  #define IBMVFC_AE_LINKUP  0x0001





Re: [PATCH v2 01/17] ibmvfc: add vhost fields and defaults for MQ enablement

2020-12-17 Thread John Garry

On 08/12/2020 22:37, Tyrel Datwyler wrote:

On 12/7/20 3:56 AM, Hannes Reinecke wrote:

On 12/4/20 3:26 PM, Brian King wrote:

On 12/2/20 11:27 AM, Tyrel Datwyler wrote:

On 12/2/20 7:14 AM, Brian King wrote:

On 12/1/20 6:53 PM, Tyrel Datwyler wrote:

Introduce several new vhost fields for managing MQ state of the adapter
as well as initial defaults for MQ enablement.

Signed-off-by: Tyrel Datwyler 
---
   drivers/scsi/ibmvscsi/ibmvfc.c |  9 -
   drivers/scsi/ibmvscsi/ibmvfc.h | 13 +++--
   2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 42e4d35e0d35..f1d677a7423d 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5161,12 +5161,13 @@ static int ibmvfc_probe(struct vio_dev *vdev, const
struct vio_device_id *id)
   }
     shost->transportt = ibmvfc_transport_template;
-    shost->can_queue = max_requests;
+    shost->can_queue = (max_requests / IBMVFC_SCSI_HW_QUEUES);


This doesn't look right. can_queue is the SCSI host queue depth, not the MQ
queue depth.


Our max_requests is the total number of commands allowed across all queues. From
what I understand, can_queue is the total number of commands in flight allowed
for each hw queue.

  /*
   * In scsi-mq mode, the number of hardware queues supported by the 
LLD.
   *
   * Note: it is assumed that each hardware queue has a queue depth of
   * can_queue. In other words, the total queue depth per host
   * is nr_hw_queues * can_queue. However, for when host_tagset is set,
   * the total queue depth is can_queue.
   */

We currently don't use the host wide shared tagset.


Ok. I missed that bit... In that case, since we allocate by default only 100
event structs. If we slice that across IBMVFC_SCSI_HW_QUEUES (16) queues, then
we end up with only about 6 commands that can be outstanding per queue,
which is going to really hurt performance... I'd suggest bumping up
IBMVFC_MAX_REQUESTS_DEFAULT from 100 to 1000 as a starting point.


Before doing that I'd rather use the host-wide shared tagset.
Increasing the number of requests will increase the memory footprint of the
driver (as each request will be statically allocated).


Exposing HW queues increases the memory footprint, as we allocate the static 
requests per HW queue ctx regardless of whether the shared hostwide tagset is 
enabled or not. This could probably be improved.






In the case where we use host-wide how do I determine the queue depth per
hardware queue? Is is hypothetically can_queue or is it (can_queue /
nr_hw_queues)? We want to allocate an event pool per-queue which made sense
without host-wide tags since the queue depth per hw queue is exactly can_queue.



Generally the hw queue depth should be the same as can_queue. And this applies 
when hostwide shared tags are enabled as well.


We do this for hisi_sas: the host can queue max 4096 commands over all 
queues, so we set .can_queue = 4096*, set HW queue depth = 4096, and set 
.host_tagset = 1.


* we need to reserve some commands for internal IO, so this is reduced a 
little


Thanks,
John


Re: [PATCH v8 1/7] iommu: enhance IOMMU default DMA mode build options

2019-05-31 Thread John Garry




-config IOMMU_DEFAULT_PASSTHROUGH
-bool "IOMMU passthrough by default"
+choice
+prompt "IOMMU default DMA mode"
 depends on IOMMU_API
-help
-  Enable passthrough by default, removing the need to pass in
-  iommu.passthrough=on or iommu=pt through command line. If this
-  is enabled, you can still disable with iommu.passthrough=off
-  or iommu=nopt depending on the architecture.
+default IOMMU_DEFAULT_STRICT
+help
+  This option allows IOMMU DMA mode to be chose at build time, to


As before:
/s/chose/chosen/, /s/allows IOMMU/allows an IOMMU/

I'm sorry that the previous version was not modified.




+  override the default DMA mode of each ARCHs, removing the need to


Again, as before:
ARCHs should be singular

OK




+  pass in kernel parameters through command line. You can still use
+  ARCHs specific boot options to override this option again.


*


+
+config IOMMU_DEFAULT_PASSTHROUGH
+bool "passthrough"
+help
+  In this mode, the DMA access through IOMMU without any addresses
+  translation. That means, the wrong or illegal DMA access can not
+  be caught, no error information will be reported.

   If unsure, say N here.

+config IOMMU_DEFAULT_LAZY
+bool "lazy"
+help
+  Support lazy mode, where for every IOMMU DMA unmap operation, the
+  flush operation of IOTLB and the free operation of IOVA are deferred.
+  They are only guaranteed to be done before the related IOVA will be
+  reused.


why no advisory on how to set if unsure?

Because the LAZY and STRICT have their own advantages and disadvantages.

Should I say: "If unsure, keep the default."


Maybe. So you could put this in the help for the choice, * above, and 
remove the advisory on IOMMU_DEFAULT_PASSTHROUGH.


However the maintainer may have a different view.

Thanks,
John






+
+config IOMMU_DEFAULT_STRICT
+bool "strict"
+help
+  For every IOMMU DMA unmap operation, the flush operation of IOTLB and
+  the free operation of IOVA are guaranteed to be done in the unmap
+  function.
+
+  This mode is safer than the two above, but it maybe slower in some
+  high performace scenarios.


and here?




Re: [PATCH v8 1/7] iommu: enhance IOMMU default DMA mode build options

2019-05-30 Thread John Garry

On 30/05/2019 04:48, Zhen Lei wrote:

First, add build option IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the
opportunity to set {lazy|strict} mode as default at build time. Then put
the three config options in an choice, make people can only choose one of
the three at a time.



Since this was not picked up, but modulo (sometimes same) comments below:

Reviewed-by: John Garry 


Signed-off-by: Zhen Lei 
---
 drivers/iommu/Kconfig | 42 +++---
 drivers/iommu/iommu.c |  3 ++-
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 83664db5221df02..d6a1a45f80ffbf5 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -75,17 +75,45 @@ config IOMMU_DEBUGFS
  debug/iommu directory, and then populate a subdirectory with
  entries as required.

-config IOMMU_DEFAULT_PASSTHROUGH
-   bool "IOMMU passthrough by default"
+choice
+   prompt "IOMMU default DMA mode"
depends on IOMMU_API
-help
- Enable passthrough by default, removing the need to pass in
- iommu.passthrough=on or iommu=pt through command line. If this
- is enabled, you can still disable with iommu.passthrough=off
- or iommu=nopt depending on the architecture.
+   default IOMMU_DEFAULT_STRICT
+   help
+ This option allows IOMMU DMA mode to be chose at build time, to


As before:
/s/chose/chosen/, /s/allows IOMMU/allows an IOMMU/


+ override the default DMA mode of each ARCHs, removing the need to


Again, as before:
ARCHs should be singular


+ pass in kernel parameters through command line. You can still use
+ ARCHs specific boot options to override this option again.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+   bool "passthrough"
+   help
+ In this mode, the DMA access through IOMMU without any addresses
+ translation. That means, the wrong or illegal DMA access can not
+ be caught, no error information will be reported.

  If unsure, say N here.

+config IOMMU_DEFAULT_LAZY
+   bool "lazy"
+   help
+ Support lazy mode, where for every IOMMU DMA unmap operation, the
+ flush operation of IOTLB and the free operation of IOVA are deferred.
+ They are only guaranteed to be done before the related IOVA will be
+ reused.


why no advisory on how to set if unsure?


+
+config IOMMU_DEFAULT_STRICT
+   bool "strict"
+   help
+ For every IOMMU DMA unmap operation, the flush operation of IOTLB and
+ the free operation of IOVA are guaranteed to be done in the unmap
+ function.
+
+ This mode is safer than the two above, but it maybe slower in some
+ high performace scenarios.


and here?


+
+endchoice
+
 config OF_IOMMU
def_bool y
depends on OF && IOMMU_API
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 67ee6623f9b2a4d..56bce221285b15f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -43,7 +43,8 @@
 #else
 static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_DMA;
 #endif
-static bool iommu_dma_strict __read_mostly = true;
+static bool iommu_dma_strict __read_mostly =
+   IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT);

 struct iommu_group {
struct kobject kobj;






Re: [PATCH v7 1/1] iommu: enhance IOMMU dma mode build options

2019-05-21 Thread John Garry

On 20/05/2019 14:59, Zhen Lei wrote:

First, add build option IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the
opportunity to set {lazy|strict} mode as default at build time. Then put
the three config options in an choice, make people can only choose one of
the three at a time.

The default IOMMU dma modes on each ARCHs have no change.

Signed-off-by: Zhen Lei 


Apart from more minor comments, FWIW:

Reviewed-by: John Garry 


---
 arch/ia64/kernel/pci-dma.c|  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/s390/pci/pci_dma.c   |  2 +-
 arch/x86/kernel/pci-dma.c |  7 ++---
 drivers/iommu/Kconfig | 44 ++-
 drivers/iommu/amd_iommu_init.c|  3 ++-
 drivers/iommu/intel-iommu.c   |  2 +-
 drivers/iommu/iommu.c |  3 ++-
 8 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index fe988c49f01ce6a..655511dbf3c3b34 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -22,7 +22,7 @@
 int force_iommu __read_mostly;
 #endif

-int iommu_pass_through;
+int iommu_pass_through = IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);


As commented privately, I could never see this set for ia64, and it 
seems to exist just to keep the linker happy. Anyway, I am not sure if 
it is ever suitable to be set.




 static int __init pci_iommu_init(void)
 {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ead4c237ed0ec9..383e082a9bb985c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -85,7 +85,8 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
va_end(args);
 }

-static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pnv_iommu_bypass_disabled __read_mostly =
+   !IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);
 static bool pci_reset_phbs __read_mostly;

 static int __init iommu_setup(char *str)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 9e52d1527f71495..784ad1e0acecfb1 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -17,7 +17,7 @@

 static struct kmem_cache *dma_region_table_cache;
 static struct kmem_cache *dma_page_table_cache;
-static int s390_iommu_strict;
+static int s390_iommu_strict = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT);

 static int zpci_refresh_global(struct zpci_dev *zdev)
 {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d460998ae828514..fb2bab42a0a3173 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -43,11 +43,8 @@
  * It is also possible to disable by default in kernel config, and enable with
  * iommu=nopt at boot time.
  */
-#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
-int iommu_pass_through __read_mostly = 1;
-#else
-int iommu_pass_through __read_mostly;
-#endif
+int iommu_pass_through __read_mostly =
+   IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);

 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6f07f3b21816c64..8a1f1793cde76b4 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -74,17 +74,47 @@ config IOMMU_DEBUGFS
  debug/iommu directory, and then populate a subdirectory with
  entries as required.

-config IOMMU_DEFAULT_PASSTHROUGH
-   bool "IOMMU passthrough by default"
+choice
+   prompt "IOMMU default DMA mode"
depends on IOMMU_API
-help
- Enable passthrough by default, removing the need to pass in
- iommu.passthrough=on or iommu=pt through command line. If this
- is enabled, you can still disable with iommu.passthrough=off
- or iommu=nopt depending on the architecture.
+   default IOMMU_DEFAULT_PASSTHROUGH if (PPC_POWERNV && PCI)
+   default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU || S390_IOMMU)
+   default IOMMU_DEFAULT_STRICT
+   help
+ This option allows IOMMU DMA mode to be chose at build time, to


I'd say /s/allows IOMMU/allows an IOMMU/, /s/chose/chosen/


+ override the default DMA mode of each ARCHs, removing the need to


ARCHs should be singular


+ pass in kernel parameters through command line. You can still use
+ ARCHs specific boot options to override this option again.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+   bool "passthrough"
+   help
+ In this mode, the DMA access through IOMMU without any addresses
+ translation. That means, the wrong or illegal DMA access can not
+ be caught, no error information will be reported.

  If unsure, say N here.

+config IOMMU_DEFAULT_LAZY
+   bool "lazy"
+   help
+ Support lazy mode, where for every IOMMU DM

Re: [PATCH v6 1/1] iommu: enhance IOMMU dma mode build options

2019-05-08 Thread John Garry

On 18/04/2019 14:57, Zhen Lei wrote:

First, add build option IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the
opportunity to set {lazy|strict} mode as default at build time. Then put
the three config options in an choice, make people can only choose one of
the three at a time.

The default IOMMU dma modes on each ARCHs have no change.

Signed-off-by: Zhen Lei 
---
 arch/ia64/kernel/pci-dma.c|  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/s390/pci/pci_dma.c   |  2 +-
 arch/x86/kernel/pci-dma.c |  7 ++---
 drivers/iommu/Kconfig | 44 ++-
 drivers/iommu/amd_iommu_init.c|  3 ++-
 drivers/iommu/intel-iommu.c   |  2 +-
 drivers/iommu/iommu.c |  3 ++-
 8 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index fe988c49f01ce6a..655511dbf3c3b34 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -22,7 +22,7 @@
 int force_iommu __read_mostly;
 #endif

-int iommu_pass_through;
+int iommu_pass_through = IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);

 static int __init pci_iommu_init(void)
 {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ead4c237ed0ec9..383e082a9bb985c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -85,7 +85,8 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
va_end(args);
 }

-static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pnv_iommu_bypass_disabled __read_mostly =
+   !IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);
 static bool pci_reset_phbs __read_mostly;

 static int __init iommu_setup(char *str)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 9e52d1527f71495..784ad1e0acecfb1 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -17,7 +17,7 @@

 static struct kmem_cache *dma_region_table_cache;
 static struct kmem_cache *dma_page_table_cache;
-static int s390_iommu_strict;
+static int s390_iommu_strict = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT);

 static int zpci_refresh_global(struct zpci_dev *zdev)
 {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d460998ae828514..fb2bab42a0a3173 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -43,11 +43,8 @@
  * It is also possible to disable by default in kernel config, and enable with
  * iommu=nopt at boot time.
  */
-#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
-int iommu_pass_through __read_mostly = 1;
-#else
-int iommu_pass_through __read_mostly;
-#endif
+int iommu_pass_through __read_mostly =
+   IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);

 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6f07f3b21816c64..8a1f1793cde76b4 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -74,17 +74,47 @@ config IOMMU_DEBUGFS
  debug/iommu directory, and then populate a subdirectory with
  entries as required.

-config IOMMU_DEFAULT_PASSTHROUGH
-   bool "IOMMU passthrough by default"
+choice
+   prompt "IOMMU dma mode"


/s/dma/DMA/

And how about add "default", as in "Default IOMMU DMA mode" or "IOMMU 
default DMA mode"?



depends on IOMMU_API
-help
- Enable passthrough by default, removing the need to pass in
- iommu.passthrough=on or iommu=pt through command line. If this
- is enabled, you can still disable with iommu.passthrough=off
- or iommu=nopt depending on the architecture.
+   default IOMMU_DEFAULT_PASSTHROUGH if (PPC_POWERNV && PCI)
+   default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU || S390_IOMMU)
+   default IOMMU_DEFAULT_STRICT
+   help
+ This option allows IOMMU dma mode to be chose at build time, to


again, capitalize acronyms, i.e. /s/dma/DMA/ (more of these above and below)


+ override the default dma mode of each ARCHs, removing the need to
+ pass in kernel parameters through command line. You can still use
+ ARCHs specific boot options to override this option again.
+
+config IOMMU_DEFAULT_PASSTHROUGH


I think that it may need to be indented, along with the other choices


+   bool "passthrough"
+   help
+ In this mode, the dma access through IOMMU without any addresses
+ transformation. That means, the wrong or illegal dma access can not


transformation, or translation?


+ be caught, no error information will be reported.

  If unsure, say N here.

+config IOMMU_DEFAULT_LAZY
+   bool "lazy"
+   help
+ Support lazy mode, where for every IOMMU DMA unmap operation, the
+ flush operation of IOTLB and the free operation of 

Re: [PATCH v5 1/6] iommu: add generic boot option iommu.dma_mode

2019-04-12 Thread John Garry

On 09/04/2019 13:53, Zhen Lei wrote:

Currently the IOMMU dma contains 3 modes: passthrough, lazy, strict. The
passthrough mode bypass the IOMMU, the lazy mode defer the invalidation
of hardware TLBs, and the strict mode invalidate IOMMU hardware TLBs
synchronously. The three modes are mutually exclusive. But the current
boot options are confused, such as: iommu.passthrough and iommu.strict,
because they are no good to be coexist. So add iommu.dma_mode.

Signed-off-by: Zhen Lei 
---
 Documentation/admin-guide/kernel-parameters.txt | 19 
 drivers/iommu/iommu.c   | 59 -
 include/linux/iommu.h   |  5 +++
 3 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 2b8ee90bb64470d..f7766f8ac8b9084 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1811,6 +1811,25 @@
1 - Bypass the IOMMU for DMA.
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.

+   iommu.dma_mode= Configure default dma mode. if unset, use the value
+   of CONFIG_IOMMU_DEFAULT_PASSTHROUGH to determine
+   passthrough or not.


To me, for unset it's unclear what we default to. So if unset and also 
CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set, do we get lazy or strict 
mode? (note: I'm ignoring backwards compatibility and interaction of 
iommu.strict and .passthrough also, more below).


Could we considering introducing config DEFAULT_IOMMU_DMA_MODE, similar 
to DEFAULT_IOSCHED?



+   Note: For historical reasons, ARM64/S390/PPC/X86 have
+   their specific options. Currently, only ARM64 support
+   this boot option, and hope other ARCHs to use this as
+   generic boot option.
+   passthrough
+   Configure DMA to bypass the IOMMU by default.
+   lazy
+   Request that DMA unmap operations use deferred
+   invalidation of hardware TLBs, for increased
+   throughput at the cost of reduced device isolation.
+   Will fall back to strict mode if not supported by
+   the relevant IOMMU driver.
+   strict
+   DMA unmap operations invalidate IOMMU hardware TLBs
+   synchronously.
+
io7=[HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 109de67d5d727c2..df1ce8e22385b48 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -38,12 +38,13 @@

 static struct kset *iommu_group_kset;
 static DEFINE_IDA(iommu_group_ida);
+
 #ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
-static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY;
+#define IOMMU_DEFAULT_DMA_MODE IOMMU_DMA_MODE_PASSTHROUGH
 #else
-static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_DMA;
+#define IOMMU_DEFAULT_DMA_MODE IOMMU_DMA_MODE_STRICT
 #endif
-static bool iommu_dma_strict __read_mostly = true;
+static int iommu_default_dma_mode __read_mostly = IOMMU_DEFAULT_DMA_MODE;

 struct iommu_callback_data {
const struct iommu_ops *ops;
@@ -147,20 +148,51 @@ static int __init iommu_set_def_domain_type(char *str)
int ret;

ret = kstrtobool(str, );
-   if (ret)
-   return ret;
+   if (!ret && pt)
+   iommu_default_dma_mode = IOMMU_DMA_MODE_PASSTHROUGH;

-   iommu_def_domain_type = pt ? IOMMU_DOMAIN_IDENTITY : IOMMU_DOMAIN_DMA;
-   return 0;
+   return ret;
 }
 early_param("iommu.passthrough", iommu_set_def_domain_type);

 static int __init iommu_dma_setup(char *str)
 {
-   return kstrtobool(str, _dma_strict);
+   bool strict;
+   int ret;
+
+   ret = kstrtobool(str, );
+   if (!ret)
+   iommu_default_dma_mode = strict ?
+   IOMMU_DMA_MODE_STRICT : IOMMU_DMA_MODE_LAZY;
+
+   return ret;
 }
 early_param("iommu.strict", iommu_dma_setup);

+static int __init iommu_dma_mode_setup(char *str)
+{
+   if (!str)
+   goto fail;
+
+   if (!strncmp(str, "passthrough", 11))
+   iommu_default_dma_mode = IOMMU_DMA_MODE_PASSTHROUGH;
+   else if (!strncmp(str, "lazy", 4))
+   iommu_default_dma_mode = IOMMU_DMA_MODE_LAZY;
+   else if (!strncmp(str, "strict", 6))
+   iommu_default_dma_mode = IOMMU_DMA_MODE_STRICT;
+   else
+   goto fail;
+
+   pr_info("Force dma mode to be %d\n", iommu_default_dma_mode);


What happens if the cmdline option iommu.dma_mode is passed