Re: [PATCH v2 0/5] Move remaining x86 Travis jobs to the gitlab-CI

2021-02-09 Thread Alex Bennée
Will do.

On Wed, 10 Feb 2021, 05:44 Thomas Huth,  wrote:

> On 09/02/2021 21.37, Alex Bennée wrote:
> >
> > Thomas Huth  writes:
> >
> >> Since Travis changed their policies, travis-ci.org will soon become
> >> completely useless for the QEMU project. We should now really make sure
> >> that we move the remaining tests as well as possible to the gitlab-CI
> >> instead.
> >
> > Queued to testing/next, thanks.
>
> Thanks, but please unqueue them again, I still want to send a v3 to
> address
> your comment on the -fsanitize=undefined patch... and I also noticed that
> the gprof/gcov job runs very long and sometimes hits the 1h time limit, so
> I
> need to revisit the set of target architectures there...
>
>   Thomas
>
>


[PATCH 2/2] hw/block/nvme: add write uncorrectable command

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Add support for marking blocks invalid with the Write Uncorrectable
command. Block status is tracked in a (non-persistent) bitmap that is
checked on all reads and written to on all writes. This is potentially
expensive, so keep Write Uncorrectable disabled by default.

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 docs/specs/nvme.txt   |  3 ++
 hw/block/nvme-ns.h|  2 ++
 hw/block/nvme.h   |  1 +
 hw/block/nvme-ns.c|  2 ++
 hw/block/nvme.c   | 65 +--
 hw/block/trace-events |  1 +
 6 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 56d393884e7a..88f9cc278d4c 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -19,5 +19,8 @@ Known issues
 
 * The accounting numbers in the SMART/Health are reset across power cycles
 
+* Marking blocks invalid with the Write Uncorrectable is not persisted across
+  power cycles.
+
 * Interrupt Coalescing is not supported and is disabled by default in volation
   of the specification.
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..15fa422ded03 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
 struct {
 uint32_t err_rec;
 } features;
+
+unsigned long *uncorrectable;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 98082b2dfba3..9b8f85b9cf16 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
 case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
+case NVME_CMD_WRITE_UNCOR:  return "NVME_CMD_WRITE_UNCOR";
 case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
 case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ade46e2f3739..742bbc4b4b62 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 id_ns->mcl = cpu_to_le32(ns->params.mcl);
 id_ns->msrc = ns->params.msrc;
 
+ns->uncorrectable = bitmap_new(id_ns->nsze);
+
 return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e5f725d7..56048046c193 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, 
uint64_t slba,
 return NVME_SUCCESS;
 }
 
+static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
+uint32_t nlb)
+{
+uint64_t elba = nlb + slba;
+
+if (ns->uncorrectable) {
+if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
+return NVME_UNRECOVERED_READ | NVME_DNR;
+}
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_aio_err(NvmeRequest *req, int ret)
 {
 uint16_t status = NVME_SUCCESS;
@@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
 BlockAcctCookie *acct = >acct;
 BlockAcctStats *stats = blk_get_stats(blk);
 
+bool is_write = nvme_is_write(req);
+
 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
-if (ns->params.zoned && nvme_is_write(req)) {
+if (ns->params.zoned && is_write) {
 nvme_finalize_zoned_write(ns, req);
 }
 
 if (!ret) {
 block_acct_done(stats, acct);
+
+if (is_write) {
+NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+
+bitmap_clear(ns->uncorrectable, slba, nlb);
+}
 } else {
 block_acct_failed(stats, acct);
 nvme_aio_err(req, ret);
@@ -1521,13 +1545,13 @@ static void nvme_copy_cb(void *opaque, int ret)
 {
 NvmeRequest *req = opaque;
 NvmeNamespace *ns = req->ns;
+NvmeCopyCmd *copy = (NvmeCopyCmd *)>cmd;
+uint64_t sdlba = le64_to_cpu(copy->sdlba);
 struct nvme_copy_ctx *ctx = req->opaque;
 
 trace_pci_nvme_copy_cb(nvme_cid(req));
 
 if (ns->params.zoned) {
-NvmeCopyCmd *copy = (NvmeCopyCmd *)>cmd;
-uint64_t sdlba = le64_to_cpu(copy->sdlba);
 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
 
 __nvme_advance_zone_wp(ns, zone, ctx->nlb);
@@ -1535,6 +1559,7 @@ static void nvme_copy_cb(void *opaque, int ret)
 
 if (!ret) {
 block_acct_done(blk_get_stats(ns->blkconf.blk), >acct);
+bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb);
 } else {
 block_acct_failed(blk_get_stats(ns->blkconf.blk), >acct);
 nvme_aio_err(req, ret);
@@ -1953,6 +1978,12 @@ static uint16_t nvme_read(NvmeCtrl *n, 

[PATCH 1/2] hw/block/nvme: add oncs device parameter

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Add the 'oncs' nvme device parameter to allow optional features to be
enabled/disabled explicitly. Since most of these are optional commands,
make the CSE log pages dynamic to account for the value of ONCS.

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h |   7 
 hw/block/nvme.c | 101 
 2 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index cb2b5175f1a1..98082b2dfba3 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,6 +9,7 @@
 
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_COMMANDS 0x100
 
 typedef struct NvmeParams {
 char *serial;
@@ -22,6 +23,7 @@ typedef struct NvmeParams {
 bool use_intel_id;
 uint32_t zasl_bs;
 bool legacy_cmb;
+uint16_t oncs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -183,6 +185,11 @@ typedef struct NvmeCtrl {
 NvmeCQueue  admin_cq;
 NvmeIdCtrl  id_ctrl;
 NvmeFeatureVal  features;
+
+struct {
+uint32_t nvm[NVME_MAX_COMMANDS];
+uint32_t zoned[NVME_MAX_COMMANDS];
+} iocs;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 93345bf3c1fc..e5f725d7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -71,6 +71,11 @@
  *   data size being in effect. By setting this property to 0, users can make
  *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
  *
+ * - `oncs`
+ *   This field indicates the optional NVM commands and features supported
+ *   by the controller. To add support for the optional feature, needs to
+ *   set the corresponding support indicated bit.
+ *
  * nvme namespace device parameters
  * 
  * - `subsys`
@@ -165,7 +170,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
-static const uint32_t nvme_cse_acs[256] = {
+static const uint32_t nvme_cse_acs[NVME_MAX_COMMANDS] = {
 [NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
@@ -178,30 +183,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
 };
 
-static const uint32_t nvme_cse_iocs_none[256];
-
-static const uint32_t nvme_cse_iocs_nvm[256] = {
-[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_DSM]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COMPARE]  = NVME_CMD_EFF_CSUPP,
-};
-
-static const uint32_t nvme_cse_iocs_zoned[256] = {
-[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_DSM]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COMPARE]  = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_ZONE_APPEND]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_ZONE_MGMT_SEND]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_ZONE_MGMT_RECV]   = NVME_CMD_EFF_CSUPP,
-};
+static const uint32_t nvme_cse_iocs_none[NVME_MAX_COMMANDS];
 
 static void nvme_process_sq(void *opaque);
 
@@ -2884,17 +2866,17 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t 
csi, uint32_t buf_len,
 
 switch (NVME_CC_CSS(n->bar.cc)) {
 case NVME_CC_CSS_NVM:
-src_iocs = nvme_cse_iocs_nvm;
+src_iocs = n->iocs.nvm;
 /* fall through */
 case NVME_CC_CSS_ADMIN_ONLY:
 break;
 case NVME_CC_CSS_CSI:
 switch (csi) {
 case NVME_CSI_NVM:
-src_iocs = nvme_cse_iocs_nvm;
+src_iocs = n->iocs.nvm;
 break;
 case NVME_CSI_ZONED:
-src_iocs = nvme_cse_iocs_zoned;
+src_iocs = n->iocs.zoned;
 break;
 }
 }
@@ -3422,6 +3404,10 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_FEATURES) && sel) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
 if 

[PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

First, add support for toggling optional features through the new `oncs`
nvme device parameter.

Secondly, add support for the Write Uncorrectable command.

Gollu Appalanaidu (2):
  hw/block/nvme: add oncs device parameter
  hw/block/nvme: add write uncorrectable command

 docs/specs/nvme.txt   |   3 +
 hw/block/nvme-ns.h|   2 +
 hw/block/nvme.h   |   8 ++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 166 +++---
 hw/block/trace-events |   1 +
 6 files changed, 140 insertions(+), 42 deletions(-)

-- 
2.30.0




Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command

2021-02-09 Thread Klaus Jensen
On Feb 10 12:32, Keith Busch wrote:
> On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> > On Feb  9 03:59, Keith Busch wrote:
> > > This whole implementation would be much simpler with the synchronous
> > > blk_flush() routine instead of the AIO equivalent. This is not really a
> > > performant feature, so I don't think it's critical to get these
> > > operations happening in parallel. What do you think?
> > 
> > It would definitely be simpler, but I believe that if there is a lot to
> > flush, then we won't just block the nvme device. We are holding the Big
> > QEMU Lock and will block most other devices as well.
> 
> Hm, I feel like you may have told me this same explanation for a
> different patch. :) Okay, I'm convinced: this is the way.
> 

Is that an Acked-by? ;)

And yes, I might have used that argument for Copy, can't remember ;)


signature.asc
Description: PGP signature


Re: [PATCH] spapr: Adjust firmware path of PCI devices

2021-02-09 Thread Alexey Kardashevskiy




On 27/01/2021 12:28, Alexey Kardashevskiy wrote:



On 25/01/2021 21:23, Greg Kurz wrote:

On Sat, 23 Jan 2021 13:36:34 +1100
Alexey Kardashevskiy  wrote:




On 23/01/2021 04:01, Greg Kurz wrote:

It is currently not possible to perform a strict boot from USB storage:

$ qemu-system-ppc64 -accel kvm -nodefaults -nographic -serial stdio \
-boot strict=on \
-device qemu-xhci \
-device usb-storage,drive=disk,bootindex=0 \
-blockdev driver=file,node-name=disk,filename=fedora-ppc64le.qcow2


SLOF 
**

QEMU Starting
   Build Date = Jul 17 2020 11:15:24
   FW Version = git-e18ddad8516ff2cf
   Press "s" to enter Open Firmware.

Populating /vdevice methods
Populating /vdevice/vty@7100
Populating /vdevice/nvram@7101
Populating /pci@8002000
   00  (D) : 1b36 000d    serial bus [ 
usb-xhci ]

No NVRAM common partition, re-initializing...
Scanning USB
    XHCI: Initializing
  USB Storage
 SCSI: Looking for devices
    101 DISK : "QEMU QEMU HARDDISK    2.5+"
Using default console: /vdevice/vty@7100

    Welcome to Open Firmware

    Copyright (c) 2004, 2017 IBM Corporation All rights reserved.
    This program and the accompanying materials are made available
    under the terms of the BSD License available at
    http://www.opensource.org/licenses/bsd-license.php


Trying to load:  from: 
/pci@8002000/usb@0/storage@1/disk@101 ...

E3405: No such device

E3407: Load failed

    Type 'boot' and press return to continue booting the system.
    Type 'reset-all' and press return to reboot the system.


Ready!
0 >

The device tree handed over by QEMU to SLOF indeed contains:

qemu,boot-list =
"/pci@8002000/usb@0/storage@1/disk@101 HALT";

but the device node is named usb-xhci@0, not usb@0.



I'd expect it to be a return of qdev_fw_name() so in this case something
like "nec-usb-xhci" (which would still be broken) but seeing a plain
"usb" is a bit strange.



The logic under get_boot_devices_list() is a bit hard to follow
because of the multiple indirections, but AFAICT it doesn't seem
to rely on qdev_fw_name() to get the names.

None of the XHCI devices seem to be setting DeviceClass::fw_name anyway:

$ git grep fw_name hw/usb/
hw/usb/bus.c: qdev_fw_name(qdev), nr);
hw/usb/dev-hub.c:    dc->fw_name = "hub";
hw/usb/dev-mtp.c:    dc->fw_name = "mtp";
hw/usb/dev-network.c:    dc->fw_name = "network";
hw/usb/dev-storage.c:    dc->fw_name = "storage";
hw/usb/dev-uas.c:    dc->fw_name = "storage";

The plain "usb" naming comes from PCI, which has its own naming
logic for PCI devices (which qemu-xhci happens to be) :



Right, this was the confusing bit for me. I thought that by just setting 
dc->fw_name to what we put in the DT should be enough but it is not.





#0  0x000100319474 in pci_dev_fw_name (len=33, buf=0x7fffd4c8 
"\020", dev=0x7fffc3320010) at ../../hw/pci/pci.c:2533
#1  0x000100319474 in pcibus_get_fw_dev_path (dev=0x7fffc3320010) 
at ../../hw/pci/pci.c:2550
#2  0x00010053118c in bus_get_fw_dev_path (dev=0x7fffc3320010, 
bus=) at ../../hw/core/qdev-fw.c:38
#3  0x00010053118c in qdev_get_fw_dev_path_helper 
(dev=0x7fffc3320010, p=0x7fffd728 "/pci@8002000/", 
size=128) at ../../hw/core/qdev-fw.c:72
#4  0x000100531064 in qdev_get_fw_dev_path_helper 
(dev=0x101c864a0, p=0x7fffd728 "/pci@8002000/", size=128) 
at ../../hw/core/qdev-fw.c:69
#5  0x000100531064 in qdev_get_fw_dev_path_helper 
(dev=0x1019f3560, p=0x7fffd728 "/pci@8002000/", size=128) 
at ../../hw/core/qdev-fw.c:69
#6  0x0001005312f0 in qdev_get_fw_dev_path (dev=) 
at ../../hw/core/qdev-fw.c:91
#7  0x000100588a68 in get_boot_device_path (dev=, 
ignore_suffixes=, ignore_suffixes@entry=true, 
suffix=) at ../../softmmu/bootdevice.c:211
#8  0x000100589540 in get_boot_devices_list (size=0x7fffd990) 
at ../../softmmu/bootdevice.c:257
#9  0x000100606764 in spapr_dt_chosen (reset=true, 
fdt=0x7fffc26f0010, spapr=0x10149aef0) at ../../hw/ppc/spapr.c:1019




While your patch works, I wonder if we should assign fw_name to all pci
nodes to avoid similar problems in the future? Thanks,



Not sure to understand "assign fw_name to all pci nodes" ...



Basically this:

=
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index de0fae10ab9c..8a286419aaf8 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2508,7 +2508,12 @@ static char *pci_dev_fw_name(DeviceState *dev, 
char *buf, int len)

  const char *name = NULL;
  const pci_class_desc *desc =  pci_class_descriptions;
  int class = pci_get_word(d->config + PCI_CLASS_DEVICE);
+    DeviceClass *dc = DEVICE_GET_CLASS(dev);

+    if (dc->fw_name) {
+    pstrcpy(buf, len, dc->fw_name);
+    return buf;
+    }
  while (desc->desc &&
    (class & ~desc->fw_ign_bits) !=
    

Re: [PATCH 0/3] virtio-net: graceful drop of vhost for TAP

2021-02-09 Thread Jason Wang



On 2021/2/9 下午11:04, Michael S. Tsirkin wrote:

On Tue, Feb 09, 2021 at 02:51:05PM +, Daniel P. Berrangé wrote:

On Tue, Feb 09, 2021 at 09:34:20AM -0500, Michael S. Tsirkin wrote:

On Thu, Feb 04, 2021 at 10:29:12PM +0200, Yuri Benditovich wrote:

This set of patches introduces graceful switch from tap-vhost to
tap-no-vhost depending on guest features. Before that the features
that vhost does not support were silently cleared in get_features.
This creates potential problem of migration from the machine where
some of virtio-net features are supported by the vhost kernel to the
machine where they are not supported (packed ring as an example).

I still worry that adding new features will silently disable vhost for people.
Can we limit the change to when a VM is migrated in?

Some management applications expect bi-directional live migration to
work, so taking specific actions on incoming migration only feels
dangerous.

Could you be more specific?

Bi-directional migration is currently broken
when migrating new kernel->old kernel.

This seems to be the motivation for this patch, though I wish
it was spelled out more explicitly.

People don't complain much, but I'm fine with fixing that
with a userspace fallback.


I'd rather not force the fallback on others though: vhost is generally
specified explicitly by user while features are generally set
automatically, so this patch will make us override what user specified,
not nice.



IMHO if the features we're adding cannot be expected to exist in
host kernels in general, then the feature should default to off
and require explicit user config to enable.
Downstream distros which can guarantee newer kernels can flip the
default in their custom machine types if they desire.

Regards,
Daniel

Unfortunately that will basically mean we are stuck with no new features
for years. We did what this patch is trying to change for years now, in
particular KVM also seems to happily disable CPU features not supported
by kernel so I wonder why we can't keep doing it, with tweaks for some
corner cases.



It's probably not the corner case.

So my understanding is when a feature is turned on via command line, it 
should not be cleared silently otherwise we may break migration for sure.


E.g when packed=on is specified, we should disable vhost instead of 
clear it from the device.


Thanks




userspace and kernel not being in 100% sync wrt features is not
a corner case though, and switching backends seems like too big
a hammer.


--
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|







[PULL 17/19] hw/ppc: e500: Fill in correct clock-frequency for the serial nodes

2021-02-09 Thread David Gibson
From: Bin Meng 

At present the clock-frequency property of the serial node is
populated with value zero. U-Boot's ns16550 driver is not happy
about this, so let's fill in a meaningful value.

Signed-off-by: Bin Meng 
Reviewed-by: Philippe Mathieu-Daudé 

Message-Id: <1612362288-22216-2-git-send-email-bmeng...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/e500.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index c795276668..01517a6c6c 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -126,7 +126,7 @@ static void dt_serial_create(void *fdt, unsigned long long 
offset,
 qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550");
 qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100);
 qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx);
-qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", 0);
+qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ);
 qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2);
 qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic);
 qemu_fdt_setprop_string(fdt, "/aliases", alias, ser);
-- 
2.29.2




[PULL 15/19] ppc/pnv: Set default RAM size to 1 GB

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

The memory layout of the PowerNV machine is defined as :

  #define KERNEL_LOAD_BASE  ((void *)0x2000)
  #define KERNEL_LOAD_SIZE  0x0800

  #define INITRAMFS_LOAD_BASE   KERNEL_LOAD_BASE + KERNEL_LOAD_SIZE
  #define INITRAMFS_LOAD_SIZE   0x0800

  #define SKIBOOT_BASE  0x3000
  #define SKIBOOT_SIZE  0x01c1

  #define CPU_STACKS_BASE   (SKIBOOT_BASE + SKIBOOT_SIZE)
  #define STACK_SHIFT   15
  #define STACK_SIZE(1 << STACK_SHIFT)

The overall size of the CPU stacks is (max PIR + 1) * 32K and the
machine easily reaches 800MB of minimum required RAM.

Any value below will result in a skiboot crash :

[0.034949905,3] MEM: Partial overlap detected between regions:
[0.034959039,3] MEM: ibm,firmware-stacks [0x31c1-0x3a45] (new)
[0.034968576,3] MEM: ibm,firmware-allocs-memory@0 
[0x31c1-0x3840]
[0.034980367,3] Out of memory adding skiboot reserved areas
[0.035074945,3] ***
[0.035093627,3] < assert failed at core/mem_region.c:1129 >
[0.035104247,3] .
[0.035108025,3]  .
[0.035111651,3]   .
[0.035115231,3] OO__)
[0.035119198,3]<"__/
[0.035122980,3] ^ ^

Signed-off-by: Cédric Le Goater 
Message-Id: <20210129111719.790692-1-...@kaod.org>
Reviewed-by: Greg Kurz 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 50810df838..77af846cdf 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -21,6 +21,7 @@
 #include "qemu-common.h"
 #include "qemu/datadir.h"
 #include "qemu/units.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "sysemu/qtest.h"
 #include "sysemu/sysemu.h"
@@ -725,8 +726,11 @@ static void pnv_init(MachineState *machine)
 DeviceState *dev;
 
 /* allocate RAM */
-if (machine->ram_size < (1 * GiB)) {
-warn_report("skiboot may not work with < 1GB of RAM");
+if (machine->ram_size < mc->default_ram_size) {
+char *sz = size_to_str(mc->default_ram_size);
+error_report("Invalid RAM size, should be bigger than %s", sz);
+g_free(sz);
+exit(EXIT_FAILURE);
 }
 memory_region_add_subregion(get_system_memory(), 0, machine->ram);
 
@@ -1994,7 +1998,7 @@ static void pnv_machine_class_init(ObjectClass *oc, void 
*data)
  * RAM defaults to less than 2048 for 32-bit hosts, and large
  * enough to fit the maximum initrd size at it's load address
  */
-mc->default_ram_size = INITRD_LOAD_ADDR + INITRD_MAX_SIZE;
+mc->default_ram_size = 1 * GiB;
 mc->default_ram_id = "pnv.ram";
 ispc->print_info = pnv_pic_print_info;
 nc->nmi_monitor_handler = pnv_nmi;
-- 
2.29.2




[PULL 19/19] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread David Gibson
From: Bin Meng 

Per EREF 2.0 [1] chapter 3.11.2:

The following bits in L2CSR0 (exists in the e500mc/e5500/e6500 core):

- L2FI  (L2 cache flash invalidate)
- L2FL  (L2 cache flush)
- L2LFC (L2 cache lock flash clear)

when set, a cache operation is initiated by hardware, and these bits
will be cleared when the operation is complete.

Since we don't model cache in QEMU, let's add a write helper to emulate
the cache operations completing instantly.

[1] https://www.nxp.com/files-static/32bit/doc/ref_manual/EREFRM.pdf

Signed-off-by: Bin Meng 

Message-Id: <1612925152-20913-1-git-send-email-bmeng...@gmail.com>
Signed-off-by: David Gibson 
---
 target/ppc/cpu.h|  6 ++
 target/ppc/translate_init.c.inc | 16 
 2 files changed, 22 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index cb00210288..e73416da68 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1919,6 +1919,7 @@ typedef PowerPCCPU ArchCPU;
 #define SPR_750FX_HID2(0x3F8)
 #define SPR_Exxx_L1FINV0  (0x3F8)
 #define SPR_L2CR  (0x3F9)
+#define SPR_Exxx_L2CSR0   (0x3F9)
 #define SPR_L3CR  (0x3FA)
 #define SPR_750_TDCH  (0x3FA)
 #define SPR_IABR2 (0x3FA)
@@ -1974,6 +1975,11 @@ typedef PowerPCCPU ArchCPU;
 #define   L1CSR1_ICFI   0x0002  /* Instruction Cache Flash Invalidate */
 #define   L1CSR1_ICE0x0001  /* Instruction Cache Enable */
 
+/* E500 L2CSR0 */
+#define E500_L2CSR0_L2FI(1 << 21)   /* L2 cache flash invalidate */
+#define E500_L2CSR0_L2FL(1 << 11)   /* L2 cache flush */
+#define E500_L2CSR0_L2LFC   (1 << 10)   /* L2 cache lock flash clear */
+
 /* HID0 bits */
 #define HID0_DEEPNAP(1 << 24)   /* pre-2.06 */
 #define HID0_DOZE   (1 << 23)   /* pre-2.06 */
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index 9867d0a6e4..3ec45cbc19 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -1735,6 +1735,16 @@ static void spr_write_e500_l1csr1(DisasContext *ctx, int 
sprn, int gprn)
 tcg_temp_free(t0);
 }
 
+static void spr_write_e500_l2csr0(DisasContext *ctx, int sprn, int gprn)
+{
+TCGv t0 = tcg_temp_new();
+
+tcg_gen_andi_tl(t0, cpu_gpr[gprn],
+~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL | 
E500_L2CSR0_L2LFC));
+gen_store_spr(sprn, t0);
+tcg_temp_free(t0);
+}
+
 static void spr_write_booke206_mmucsr0(DisasContext *ctx, int sprn, int gprn)
 {
 gen_helper_booke206_tlbflush(cpu_env, cpu_gpr[gprn]);
@@ -5029,6 +5039,12 @@ static void init_proc_e500(CPUPPCState *env, int version)
  SPR_NOACCESS, SPR_NOACCESS,
  _read_generic, _write_e500_l1csr1,
  0x);
+if (version != fsl_e500v1 && version != fsl_e500v2) {
+spr_register(env, SPR_Exxx_L2CSR0, "L2CSR0",
+ SPR_NOACCESS, SPR_NOACCESS,
+ _read_generic, _write_e500_l2csr0,
+ 0x);
+}
 spr_register(env, SPR_BOOKE_MCSRR0, "MCSRR0",
  SPR_NOACCESS, SPR_NOACCESS,
  _read_generic, _write_generic,
-- 
2.29.2




[PULL 11/19] ppc/pnv: Introduce a LPC FW memory region attribute to map the PNOR

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

This to map the PNOR from the machine init handler directly and finish
the cleanup of the LPC model.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-8-...@kaod.org>
Reviewed-by: Joel Stanley 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv.c | 11 +++
 hw/ppc/pnv_lpc.c |  7 ---
 include/hw/ppc/pnv.h |  1 +
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index e500c2e243..50810df838 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -871,6 +871,14 @@ static void pnv_init(MachineState *machine)
 pnv_ipmi_bt_init(pnv->isa_bus, pnv->bmc, 10);
 }
 
+/*
+ * The PNOR is mapped on the LPC FW address space by the BMC.
+ * Since we can not reach the remote BMC machine with LPC memops,
+ * map it always for now.
+ */
+memory_region_add_subregion(pnv->chips[0]->fw_mr, PNOR_SPI_OFFSET,
+>pnor->mmio);
+
 /*
  * OpenPOWER systems use a IPMI SEL Event message to notify the
  * host to powerdown
@@ -1150,6 +1158,7 @@ static void pnv_chip_power8_realize(DeviceState *dev, 
Error **errp)
 qdev_realize(DEVICE(>lpc), NULL, _fatal);
 pnv_xscom_add_subregion(chip, PNV_XSCOM_LPC_BASE, >lpc.xscom_regs);
 
+chip->fw_mr = >lpc.isa_fw;
 chip->dt_isa_nodename = g_strdup_printf("/xscom@%" PRIx64 "/isa@%x",
 (uint64_t) PNV_XSCOM_BASE(chip),
 PNV_XSCOM_LPC_BASE);
@@ -1479,6 +1488,7 @@ static void pnv_chip_power9_realize(DeviceState *dev, 
Error **errp)
 memory_region_add_subregion(get_system_memory(), PNV9_LPCM_BASE(chip),
 >lpc.xscom_regs);
 
+chip->fw_mr = >lpc.isa_fw;
 chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0",
 (uint64_t) PNV9_LPCM_BASE(chip));
 
@@ -1592,6 +1602,7 @@ static void pnv_chip_power10_realize(DeviceState *dev, 
Error **errp)
 memory_region_add_subregion(get_system_memory(), PNV10_LPCM_BASE(chip),
 >lpc.xscom_regs);
 
+chip->fw_mr = >lpc.isa_fw;
 chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0",
 (uint64_t) PNV10_LPCM_BASE(chip));
 }
diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index 11739e397b..bcbca3db97 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -824,7 +824,6 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool 
use_cpld, Error **errp)
 ISABus *isa_bus;
 qemu_irq *irqs;
 qemu_irq_handler handler;
-PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
 
 /* let isa_bus_new() create its own bridge on SysBus otherwise
  * devices specified on the command line won't find the bus and
@@ -850,11 +849,5 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool 
use_cpld, Error **errp)
 
 isa_bus_irqs(isa_bus, irqs);
 
-/*
- * TODO: Map PNOR on the LPC FW address space on demand ?
- */
-memory_region_add_subregion(>isa_fw, PNOR_SPI_OFFSET,
->pnor->mmio);
-
 return isa_bus;
 }
diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index ee7eda3e01..d69cee17b2 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -58,6 +58,7 @@ struct PnvChip {
 MemoryRegion xscom;
 AddressSpace xscom_as;
 
+MemoryRegion *fw_mr;
 gchar*dt_isa_nodename;
 };
 
-- 
2.29.2




[PULL 09/19] ppc/pnv: Discard internal BMC initialization when BMC is external

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

The PowerNV machine can be run with an external IPMI BMC device
connected to a remote QEMU machine acting as BMC, using these options :

  -chardev socket,id=ipmi0,host=localhost,port=9002,reconnect=10 \
  -device ipmi-bmc-extern,id=bmc0,chardev=ipmi0 \
  -device isa-ipmi-bt,bmc=bmc0,irq=10 \
  -nodefaults

In that case, some aspects of the BMC initialization should be
skipped, since they rely on the simulator interface.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-6-...@kaod.org>
Reviewed-by: Joel Stanley 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv_bmc.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c
index 86d16b4935..b9bf5735ea 100644
--- a/hw/ppc/pnv_bmc.c
+++ b/hw/ppc/pnv_bmc.c
@@ -51,6 +51,11 @@ typedef struct OemSel {
 #define SOFT_OFF0x00
 #define SOFT_REBOOT 0x01
 
+static bool pnv_bmc_is_simulator(IPMIBmc *bmc)
+{
+return object_dynamic_cast(OBJECT(bmc), TYPE_IPMI_BMC_SIMULATOR);
+}
+
 static void pnv_gen_oem_sel(IPMIBmc *bmc, uint8_t reboot)
 {
 /* IPMI SEL Event are 16 bytes long */
@@ -79,6 +84,10 @@ void pnv_dt_bmc_sensors(IPMIBmc *bmc, void *fdt)
 const struct ipmi_sdr_compact *sdr;
 uint16_t nextrec;
 
+if (!pnv_bmc_is_simulator(bmc)) {
+return;
+}
+
 offset = fdt_add_subnode(fdt, 0, "bmc");
 _FDT(offset);
 
@@ -243,6 +252,10 @@ static const IPMINetfn hiomap_netfn = {
 
 void pnv_bmc_set_pnor(IPMIBmc *bmc, PnvPnor *pnor)
 {
+if (!pnv_bmc_is_simulator(bmc)) {
+return;
+}
+
 object_ref(OBJECT(pnor));
 object_property_add_const_link(OBJECT(bmc), "pnor", OBJECT(pnor));
 
@@ -286,7 +299,7 @@ static int bmc_find(Object *child, void *opaque)
 
 IPMIBmc *pnv_bmc_find(Error **errp)
 {
-ForeachArgs args = { TYPE_IPMI_BMC_SIMULATOR, NULL };
+ForeachArgs args = { TYPE_IPMI_BMC, NULL };
 int ret;
 
 ret = object_child_foreach_recursive(object_get_root(), bmc_find, );
-- 
2.29.2




[PULL 13/19] spapr_numa.c: create spapr_numa_initial_nvgpu_numa_id() helper

2021-02-09 Thread David Gibson
From: Daniel Henrique Barboza 

We'll need to check the initial value given to spapr->gpu_numa_id when
building the rtas DT, so put it in a helper for easier access and to
avoid repetition.

Tested-by: Cédric Le Goater 
Reviewed-by: Greg Kurz 
Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20210128174213.1349181-3-danielhb...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c  | 11 +--
 hw/ppc/spapr_numa.c | 14 ++
 include/hw/ppc/spapr_numa.h |  1 +
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 8a1a979257..85fe65f894 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2770,16 +2770,7 @@ static void spapr_machine_init(MachineState *machine)
 
 }
 
-/*
- * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
- * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
- * called from vPHB reset handler so we initialize the counter here.
- * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
- * must be equally distant from any other node.
- * The final value of spapr->gpu_numa_id is going to be written to
- * max-associativity-domains in spapr_build_fdt().
- */
-spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes);
+spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
 
 /* Init numa_assoc_array */
 spapr_numa_associativity_init(spapr, machine);
diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index 261810525b..a757dd88b8 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -46,6 +46,20 @@ static bool spapr_numa_is_symmetrical(MachineState *ms)
 return true;
 }
 
+/*
+ * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
+ * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
+ * called from vPHB reset handler so we initialize the counter here.
+ * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
+ * must be equally distant from any other node.
+ * The final value of spapr->gpu_numa_id is going to be written to
+ * max-associativity-domains in spapr_build_fdt().
+ */
+unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine)
+{
+return MAX(1, machine->numa_state->num_nodes);
+}
+
 /*
  * This function will translate the user distances into
  * what the kernel understand as possible values: 10
diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h
index b3fd950634..6f9f02d3de 100644
--- a/include/hw/ppc/spapr_numa.h
+++ b/include/hw/ppc/spapr_numa.h
@@ -31,5 +31,6 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void 
*fdt,
 int offset, PowerPCCPU *cpu);
 int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt,
  int offset);
+unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine);
 
 #endif /* HW_SPAPR_NUMA_H */
-- 
2.29.2




[PULL 18/19] hw/net: fsl_etsec: Reverse the RCTRL.RSF logic

2021-02-09 Thread David Gibson
From: Bin Meng 

Per MPC8548ERM [1] chapter 14.5.3.4.1:

When RCTRL.RSF is 1, frames less than 64 bytes are accepted upon
a DA match. But currently QEMU does the opposite. This commit
reverses the RCTRL.RSF testing logic to match the manual.

Due to the reverse of the logic, certain guests may potentially
break if they don't program eTSEC to have RCTRL.RSF bit set.
When RCTRL.RSF is 0, short frames are silently dropped, however
as of today both slirp and tap networking do not pad short frames
(e.g.: an ARP packet) to the minimum frame size of 60 bytes. So
ARP requests will be dropped, preventing the guest from becoming
visible on the network.

The same issue was reported on e1000 and vmxenet3 before, see:

commit 78aeb23eded2 ("e1000: Pad short frames to minimum size (60 bytes)")
commit 40a87c6c9b11 ("vmxnet3: Pad short frames to minimum size (60 bytes)")

[1] https://www.nxp.com/docs/en/reference-manual/MPC8548ERM.pdf

Fixes: eb1e7c3e5146 ("Add Enhanced Three-Speed Ethernet Controller (eTSEC)")
Signed-off-by: Bin Meng 

Message-Id: <1612923021-19746-1-git-send-email-bmeng...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/net/fsl_etsec/rings.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 121415abfe..fe055d3381 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -502,7 +502,7 @@ ssize_t etsec_rx_ring_write(eTSEC *etsec, const uint8_t 
*buf, size_t size)
 return -1;
 }
 
-if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
+if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
 /* CRC is not in the packet yet, so short frame is below 60 bytes */
 RING_DEBUG("%s: Drop short frame\n", __func__);
 return -1;
-- 
2.29.2




[PULL 05/19] ppc/pnv: Add trace events for PCI event notification

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

On POWER9 systems, PHB controllers signal the XIVE interrupt controller
of a source interrupt notification using a store on a MMIO region. Add
traces for such events.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-2-...@kaod.org>
Signed-off-by: David Gibson 
---
 hw/intc/pnv_xive.c   | 3 +++
 hw/intc/trace-events | 3 +++
 hw/pci-host/pnv_phb4.c   | 3 +++
 hw/pci-host/trace-events | 3 +++
 4 files changed, 12 insertions(+)

diff --git a/hw/intc/pnv_xive.c b/hw/intc/pnv_xive.c
index 5f69626b3a..ad43483612 100644
--- a/hw/intc/pnv_xive.c
+++ b/hw/intc/pnv_xive.c
@@ -24,6 +24,7 @@
 #include "hw/ppc/xive_regs.h"
 #include "hw/qdev-properties.h"
 #include "hw/ppc/ppc.h"
+#include "trace.h"
 
 #include 
 
@@ -1319,6 +1320,8 @@ static void pnv_xive_ic_hw_trigger(PnvXive *xive, hwaddr 
addr, uint64_t val)
 uint8_t blk;
 uint32_t idx;
 
+trace_pnv_xive_ic_hw_trigger(addr, val);
+
 if (val & XIVE_TRIGGER_END) {
 xive_error(xive, "IC: END trigger at @0x%"HWADDR_PRIx" data 0x%"PRIx64,
addr, val);
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index 8ed397a0d5..45ddaf48df 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -236,3 +236,6 @@ xive_tctx_tm_write(uint64_t offset, unsigned int size, 
uint64_t value) "@0x0x%"P
 xive_tctx_tm_read(uint64_t offset, unsigned int size, uint64_t value) 
"@0x0x%"PRIx64" sz=%d val=0x%" PRIx64
 xive_presenter_notify(uint8_t nvt_blk, uint32_t nvt_idx, uint8_t ring) "found 
NVT 0x%x/0x%x ring=0x%x"
 xive_end_source_read(uint8_t end_blk, uint32_t end_idx, uint64_t addr) "END 
0x%x/0x%x @0x0x%"PRIx64
+
+# pnv_xive.c
+pnv_xive_ic_hw_trigger(uint64_t addr, uint64_t val) "@0x%"PRIx64" 
val=0x%"PRIx64
diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c
index 6328e985f8..54f57c660a 100644
--- a/hw/pci-host/pnv_phb4.c
+++ b/hw/pci-host/pnv_phb4.c
@@ -22,6 +22,7 @@
 #include "hw/irq.h"
 #include "hw/qdev-properties.h"
 #include "qom/object.h"
+#include "trace.h"
 
 #define phb_error(phb, fmt, ...)\
 qemu_log_mask(LOG_GUEST_ERROR, "phb4[%d:%d]: " fmt "\n",\
@@ -1257,6 +1258,8 @@ static void pnv_phb4_xive_notify(XiveNotifier *xf, 
uint32_t srcno)
 uint64_t data = XIVE_TRIGGER_PQ | offset | srcno;
 MemTxResult result;
 
+trace_pnv_phb4_xive_notify(notif_port, data);
+
address_space_stq_be(&address_space_memory, notif_port, data,
 MEMTXATTRS_UNSPECIFIED, &result);
 if (result != MEMTX_OK) {
diff --git a/hw/pci-host/trace-events b/hw/pci-host/trace-events
index d19ca9aef6..7d8063ac42 100644
--- a/hw/pci-host/trace-events
+++ b/hw/pci-host/trace-events
@@ -20,3 +20,6 @@ unin_data_write(uint64_t addr, unsigned len, uint64_t val) 
"write addr 0x%"PRIx6
 unin_data_read(uint64_t addr, unsigned len, uint64_t val) "read addr 
0x%"PRIx64 " len %d val 0x%"PRIx64
 unin_write(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64
 unin_read(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64
+
+# pnv_phb4.c
+pnv_phb4_xive_notify(uint64_t notif_port, uint64_t data) "notif=@0x%"PRIx64" 
data=0x%"PRIx64
-- 
2.29.2




[PULL 10/19] ppc/pnv: Remove default disablement of the PNOR contents

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

On PowerNV systems, the BMC is in charge of mapping the PNOR contents
on the LPC FW address space using the HIOMAP protocol. Under QEMU, we
emulate this behavior and we also add an extra control on the flash
accesses by letting the HIOMAP command handler decide whether the
memory region is accessible or not depending on the firmware requests.

However, this behavior is not compatible with hostboot like firmwares
which need this mapping to be always available. For this reason, the
PNOR memory region is initially disabled for skiboot mode only.

This is badly placed under the LPC model and requires the use of the
machine. Since it doesn't add much, simply remove the initial setting.
The extra control in the HIOMAP command handler will still be performed.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-7-...@kaod.org>
Reviewed-by: Joel Stanley 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv_lpc.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index 5903590220..11739e397b 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -825,7 +825,6 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool 
use_cpld, Error **errp)
 qemu_irq *irqs;
 qemu_irq_handler handler;
 PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
-bool hostboot_mode = !!pnv->fw_load_addr;
 
 /* let isa_bus_new() create its own bridge on SysBus otherwise
  * devices specified on the command line won't find the bus and
@@ -856,13 +855,6 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool 
use_cpld, Error **errp)
  */
 memory_region_add_subregion(>isa_fw, PNOR_SPI_OFFSET,
 >pnor->mmio);
-/*
- * Start disabled. The HIOMAP protocol will activate the mapping
- * with HIOMAP_C_CREATE_WRITE_WINDOW
- */
-if (!hostboot_mode) {
-memory_region_set_enabled(>pnor->mmio, false);
-}
 
 return isa_bus;
 }
-- 
2.29.2




[PULL 16/19] hw/ppc: e500: Use a macro for the platform clock frequency

2021-02-09 Thread David Gibson
From: Bin Meng 

At present the platform clock frequency is using a magic number.
Convert it to a macro and use it everywhere.

Signed-off-by: Bin Meng 
Reviewed-by: Philippe Mathieu-Daudé 

Message-Id: <1612362288-22216-1-git-send-email-bmeng...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/e500.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index c64b5d08bd..c795276668 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -74,6 +74,8 @@
 #define MPC8544_I2C_IRQ43
 #define RTC_REGS_OFFSET0x68
 
+#define PLATFORM_CLK_FREQ_HZ   (400 * 1000 * 1000)
+
 struct boot_info
 {
 uint32_t dt_base;
@@ -320,8 +322,8 @@ static int ppce500_load_device_tree(PPCE500MachineState 
*pms,
 int fdt_size;
 void *fdt;
 uint8_t hypercall[16];
-uint32_t clock_freq = 400000000;
-uint32_t tb_freq = 400000000;
+uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ;
+uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ;
 int i;
 char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus";
 char *soc;
@@ -890,7 +892,7 @@ void ppce500_init(MachineState *machine)
 env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i;
 env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0;
 
-ppc_booke_timers_init(cpu, 400000000, PPC_TIMER_E500);
+ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500);
 
 /* Register reset handler */
 if (!i) {
-- 
2.29.2




[PULL 14/19] spapr_numa.c: fix ibm, max-associativity-domains calculation

2021-02-09 Thread David Gibson
From: Daniel Henrique Barboza 

The current logic for calculating 'maxdomain' makes it a sum of
numa_state->num_nodes and spapr->gpu_numa_id. spapr->gpu_numa_id is
used as an index to determine the next available NUMA id that a
given NVGPU can use.

The problem is that the initial value of gpu_numa_id, for any topology
that has more than one NUMA node, is equal to numa_state->num_nodes.
This means that our maxdomain will always be, at least, twice the
amount of existing NUMA nodes. This means that a guest with 4 NUMA
nodes will end up with the following max-associativity-domains:

rtas/ibm,max-associativity-domains
 0004 0008 0008 0008 0008

This overtuning of maxdomains doesn't go unnoticed in the guest, being
detected in SLUB during boot:

 dmesg | grep SLUB
[0.00] SLUB: HWalign=128, Order=0-3, MinObjects=0, CPUs=4, Nodes=8

SLUB is detecting 8 total nodes, with 4 nodes being online.

This patch fixes ibm,max-associativity-domains by considering the number
of NVGPU NUMA nodes presented in the guest, instead of just
spapr->gpu_numa_id.

Reported-by: Cédric Le Goater 
Tested-by: Cédric Le Goater 
Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20210128174213.1349181-4-danielhb...@gmail.com>
Reviewed-by: Greg Kurz 
Signed-off-by: David Gibson 
---
 hw/ppc/spapr_numa.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index a757dd88b8..779f18b994 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -311,6 +311,8 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, 
void *fdt, int rtas)
 {
 MachineState *ms = MACHINE(spapr);
 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
+   spapr_numa_initial_nvgpu_numa_id(ms);
 uint32_t refpoints[] = {
 cpu_to_be32(0x4),
 cpu_to_be32(0x3),
@@ -318,7 +320,7 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, 
void *fdt, int rtas)
 cpu_to_be32(0x1),
 };
 uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
-uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id;
+uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
 uint32_t maxdomains[] = {
 cpu_to_be32(4),
 cpu_to_be32(maxdomain),
-- 
2.29.2




[PULL 06/19] ppc/xive: Add firmware bit when dumping the ENDs

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

ENDs allocated by OPAL for the HW thread VPs are tagged as owned by FW.
Dump the state in 'info pic'.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-3-...@kaod.org>
Signed-off-by: David Gibson 
---
 hw/intc/xive.c | 3 ++-
 include/hw/ppc/xive_regs.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index fa8c3d8287..eeb4e62ba9 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1294,7 +1294,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t 
end_idx, Monitor *mon)
 
 pq = xive_get_field32(END_W1_ESn, end->w1);
 
-monitor_printf(mon, "  %08x %c%c %c%c%c%c%c%c%c prio:%d nvt:%02x/%04x",
+monitor_printf(mon, "  %08x %c%c %c%c%c%c%c%c%c%c prio:%d nvt:%02x/%04x",
end_idx,
pq & XIVE_ESB_VAL_P ? 'P' : '-',
pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
@@ -1305,6 +1305,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t 
end_idx, Monitor *mon)
xive_end_is_escalate(end) ? 'e' : '-',
xive_end_is_uncond_escalation(end)   ? 'u' : '-',
xive_end_is_silent_escalation(end)   ? 's' : '-',
+   xive_end_is_firmware(end)   ? 'f' : '-',
priority, nvt_blk, nvt_idx);
 
 if (qaddr_base) {
diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
index 7879692825..b7fde2354e 100644
--- a/include/hw/ppc/xive_regs.h
+++ b/include/hw/ppc/xive_regs.h
@@ -236,6 +236,8 @@ typedef struct XiveEND {
 (be32_to_cpu((end)->w0) & END_W0_UNCOND_ESCALATE)
 #define xive_end_is_silent_escalation(end)  \
 (be32_to_cpu((end)->w0) & END_W0_SILENT_ESCALATE)
+#define xive_end_is_firmware(end)  \
+(be32_to_cpu((end)->w0) & END_W0_FIRMWARE)
 
 static inline uint64_t xive_end_qaddr(XiveEND *end)
 {
-- 
2.29.2




[PULL 12/19] spapr: move spapr_machine_using_legacy_numa() to spapr_numa.c

2021-02-09 Thread David Gibson
From: Daniel Henrique Barboza 

This function is used only in spapr_numa.c.

Tested-by: Cédric Le Goater 
Reviewed-by: Greg Kurz 
Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20210128174213.1349181-2-danielhb...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c | 9 -
 hw/ppc/spapr_numa.c| 9 +
 include/hw/ppc/spapr.h | 1 -
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f9ea9d1097..8a1a979257 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -296,15 +296,6 @@ static hwaddr spapr_node0_size(MachineState *machine)
 return machine->ram_size;
 }
 
-bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr)
-{
-MachineState *machine = MACHINE(spapr);
-SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
-
-return smc->pre_5_2_numa_associativity ||
-   machine->numa_state->num_nodes <= 1;
-}
-
 static void add_str(GString *s, const gchar *s1)
 {
 g_string_append_len(s, s1, strlen(s1) + 1);
diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index b50796bbe3..261810525b 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -19,6 +19,15 @@
 /* Moved from hw/ppc/spapr_pci_nvlink2.c */
 #define SPAPR_GPU_NUMA_ID   (cpu_to_be32(1))
 
+static bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr)
+{
+MachineState *machine = MACHINE(spapr);
+SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
+
+return smc->pre_5_2_numa_associativity ||
+   machine->numa_state->num_nodes <= 1;
+}
+
 static bool spapr_numa_is_symmetrical(MachineState *ms)
 {
 int src, dst;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index c27c7ce515..ccbeeca1de 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -851,7 +851,6 @@ int spapr_max_server_number(SpaprMachineState *spapr);
 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
   uint64_t pte0, uint64_t pte1);
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
-bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr);
 
 /* DRC callbacks. */
 void spapr_core_release(DeviceState *dev);
-- 
2.29.2




[PULL 07/19] ppc/pnv: Use skiboot addresses to load kernel and ramfs

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

The current settings are useful to load large kernels (with debug) but
it moves the initrd image in a memory region not protected by
skiboot. If skiboot is compiled with DEBUG=1, memory poisoning will
corrupt the initrd.

Cc: Murilo Opsfelder Araujo 
Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-4-...@kaod.org>
Reviewed-by: Murilo Opsfelder Araujo 
Reviewed-by: Joel Stanley 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 14fc9758a9..e500c2e243 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -65,9 +65,9 @@
 #define FW_MAX_SIZE (16 * MiB)
 
 #define KERNEL_LOAD_ADDR0x2000
-#define KERNEL_MAX_SIZE (256 * MiB)
-#define INITRD_LOAD_ADDR0x6000
-#define INITRD_MAX_SIZE (256 * MiB)
+#define KERNEL_MAX_SIZE (128 * MiB)
+#define INITRD_LOAD_ADDR0x2800
+#define INITRD_MAX_SIZE (128 * MiB)
 
 static const char *pnv_chip_core_typename(const PnvChip *o)
 {
-- 
2.29.2




[PULL 08/19] ppc/pnv: Simplify pnv_bmc_create()

2021-02-09 Thread David Gibson
From: Cédric Le Goater 

and reuse pnv_bmc_set_pnor() to share the setting of the PNOR.

Signed-off-by: Cédric Le Goater 
Message-Id: <20210126171059.307867-5-...@kaod.org>
Signed-off-by: David Gibson 
---
 hw/ppc/pnv_bmc.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c
index 67ebb16c4d..86d16b4935 100644
--- a/hw/ppc/pnv_bmc.c
+++ b/hw/ppc/pnv_bmc.c
@@ -260,13 +260,8 @@ IPMIBmc *pnv_bmc_create(PnvPnor *pnor)
 Object *obj;
 
 obj = object_new(TYPE_IPMI_BMC_SIMULATOR);
-object_ref(OBJECT(pnor));
-object_property_add_const_link(obj, "pnor", OBJECT(pnor));
 qdev_realize(DEVICE(obj), NULL, _fatal);
-
-/* Install the HIOMAP protocol handlers to access the PNOR */
-ipmi_sim_register_netfn(IPMI_BMC_SIMULATOR(obj), IPMI_NETFN_OEM,
-_netfn);
+pnv_bmc_set_pnor(IPMI_BMC(obj), pnor);
 
 return IPMI_BMC(obj);
 }
-- 
2.29.2




[PULL 04/19] target/ppc: Remove unused MMU definitions

2021-02-09 Thread David Gibson
From: Philippe Mathieu-Daudé 

Remove these confusing and unused definitions.

Signed-off-by: Philippe Mathieu-Daudé 
Message-Id: <20210127232401.3525126-1-f4...@amsat.org>
Signed-off-by: David Gibson 
---
 target/ppc/cpu.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 2609e4082e..cb00210288 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2205,9 +2205,6 @@ enum {
  * may be needed for precise access rights control and precise exceptions.
  */
 enum {
-/* 1 bit to define user level / supervisor access */
-ACCESS_USER  = 0x00,
-ACCESS_SUPER = 0x01,
 /* Type of instruction that generated the access */
 ACCESS_CODE  = 0x10, /* Code fetch access*/
 ACCESS_INT   = 0x20, /* Integer load/store access*/
-- 
2.29.2




[PULL 01/19] spapr.c: use g_auto* with 'nodename' in CPU DT functions

2021-02-09 Thread David Gibson
From: Daniel Henrique Barboza 

Next patch will use the 'nodename' string in spapr_core_dt_populate()
after the point it's being freed today.

Instead of moving 'g_free(nodename)' around, let's do a QoL change in
both CPU DT functions where 'nodename' is being freed, and use
g_autofree to avoid the 'g_free()' call altogether.

Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20210120232305.241521-2-danielhb...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 612356e9ec..e7992c0422 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -791,7 +791,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState 
*spapr)
 CPUState *cs;
 int n_cpus;
 int cpus_offset;
-char *nodename;
 int i;
 
 cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
@@ -819,6 +818,7 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState 
*spapr)
 PowerPCCPU *cpu = POWERPC_CPU(cs);
 int index = spapr_get_vcpu_id(cpu);
 DeviceClass *dc = DEVICE_GET_CLASS(cs);
+g_autofree char *nodename = NULL;
 int offset;
 
 if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
@@ -827,7 +827,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState 
*spapr)
 
 nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
 offset = fdt_add_subnode(fdt, cpus_offset, nodename);
-g_free(nodename);
 _FDT(offset);
 spapr_dt_cpu(cs, fdt, offset, spapr);
 }
@@ -3749,12 +3748,11 @@ int spapr_core_dt_populate(SpaprDrc *drc, 
SpaprMachineState *spapr,
 PowerPCCPU *cpu = POWERPC_CPU(cs);
 DeviceClass *dc = DEVICE_GET_CLASS(cs);
 int id = spapr_get_vcpu_id(cpu);
-char *nodename;
+g_autofree char *nodename = NULL;
 int offset;
 
 nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
 offset = fdt_add_subnode(fdt, 0, nodename);
-g_free(nodename);
 
 spapr_dt_cpu(cs, fdt, offset, spapr);
 
-- 
2.29.2




[PULL 03/19] spapr: Adjust firmware path of PCI devices

2021-02-09 Thread David Gibson
From: Greg Kurz 

It is currently not possible to perform a strict boot from USB storage:

$ qemu-system-ppc64 -accel kvm -nodefaults -nographic -serial stdio \
-boot strict=on \
-device qemu-xhci \
-device usb-storage,drive=disk,bootindex=0 \
-blockdev driver=file,node-name=disk,filename=fedora-ppc64le.qcow2

SLOF **
QEMU Starting
 Build Date = Jul 17 2020 11:15:24
 FW Version = git-e18ddad8516ff2cf
 Press "s" to enter Open Firmware.

Populating /vdevice methods
Populating /vdevice/vty@7100
Populating /vdevice/nvram@7101
Populating /pci@8002000
 00  (D) : 1b36 000dserial bus [ usb-xhci ]
No NVRAM common partition, re-initializing...
Scanning USB
  XHCI: Initializing
USB Storage
   SCSI: Looking for devices
  101 DISK : "QEMU QEMU HARDDISK2.5+"
Using default console: /vdevice/vty@7100

  Welcome to Open Firmware

  Copyright (c) 2004, 2017 IBM Corporation All rights reserved.
  This program and the accompanying materials are made available
  under the terms of the BSD License available at
  http://www.opensource.org/licenses/bsd-license.php

Trying to load:  from: 
/pci@8002000/usb@0/storage@1/disk@101 ...
E3405: No such device

E3407: Load failed

  Type 'boot' and press return to continue booting the system.
  Type 'reset-all' and press return to reboot the system.

Ready!
0 >

The device tree handed over by QEMU to SLOF indeed contains:

qemu,boot-list =
"/pci@8002000/usb@0/storage@1/disk@101 HALT";

but the device node is named usb-xhci@0, not usb@0.

This happens because the firmware names of PCI devices returned
by get_boot_devices_list() come from pcibus_get_fw_dev_path(),
while the sPAPR PHB code uses a different naming scheme for
device nodes. This inconsistency has always been there but it was
hidden for a long time because SLOF used to rename USB device
nodes, until this commit, merged in QEMU 4.2.0 :

commit 85164ad4ed9960cac842fa4cc067c6b6699b0994
Author: Alexey Kardashevskiy 
Date:   Wed Sep 11 16:24:32 2019 +1000

pseries: Update SLOF firmware image

This fixes USB host bus adapter name in the device tree to match QEMU's
one.

Signed-off-by: Alexey Kardashevskiy 
Signed-off-by: David Gibson 

Fortunately, sPAPR implements the firmware path provider interface.
This provides a way to override the default firmware paths.

Just factor out the sPAPR PHB naming logic from spapr_dt_pci_device()
to a helper, and use it in the sPAPR firmware path provider hook.

Fixes: 85164ad4ed99 ("pseries: Update SLOF firmware image")
Signed-off-by: Greg Kurz 
Message-Id: <20210122170157.246374-1-gr...@kaod.org>
Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c  |  5 +
 hw/ppc/spapr_pci.c  | 33 ++---
 include/hw/pci-host/spapr.h |  2 ++
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0ecc193468..f9ea9d1097 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3054,6 +3054,7 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, 
BusState *bus,
 SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
 SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
 VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
+PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
 
 if (d) {
 void *spapr = CAST(void, bus->parent, "spapr-vscsi");
@@ -3127,6 +3128,10 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, 
BusState *bus,
 return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
 }
 
+if (pcidev) {
+return spapr_pci_fw_dev_name(pcidev);
+}
+
 return NULL;
 }
 
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 24b4972300..f1c7479816 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1344,15 +1344,29 @@ static int spapr_dt_pci_bus(SpaprPhbState *sphb, PCIBus 
*bus,
 return offset;
 }
 
+char *spapr_pci_fw_dev_name(PCIDevice *dev)
+{
+const gchar *basename;
+int slot = PCI_SLOT(dev->devfn);
+int func = PCI_FUNC(dev->devfn);
+uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3);
+
+basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff,
+  ccode & 0xff);
+
+if (func != 0) {
+return g_strdup_printf("%s@%x,%x", basename, slot, func);
+} else {
+return g_strdup_printf("%s@%x", basename, slot);
+}
+}
+
 /* create OF node for pci device and required OF DT properties */
 static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev,
void *fdt, int parent_offset)
 {
 int offset;
-const gchar *basename;
-gchar *nodename;
-int slot = 

[PULL 02/19] spapr.c: add 'name' property for hotplugged CPUs nodes

2021-02-09 Thread David Gibson
From: Daniel Henrique Barboza 

In the CPU hotunplug bug [1] the guest kernel throws a scary
message in dmesg:

pseries-hotplug-cpu: Failed to offline CPU , rc: -16

The reason isn't related to the bug though. This happens because the
kernel file arch/powerpc/platform/pseries/hotplug-cpu.c, function
dlpar_cpu_remove(), is not finding the device_node.name of the offending
CPU.

We're not populating the 'name' property for hotplugged CPUs. Since the
kernel relies on device_node.name for identifying CPU nodes, and the
CPUs that are coldplugged have the 'name' property filled by SLOF, this
is creating an unneeded inconsistency between hotplug and coldplug CPUs
in the kernel.

Let's fill the 'name' property for hotplugged CPUs as well. This will
make the guest dmesg throw a less intimidating message when we try to
unplug the last online CPU:

pseries-hotplug-cpu: Failed to offline CPU PowerPC,POWER9@1, rc: -16

[1] https://bugzilla.redhat.com/1911414

Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20210120232305.241521-3-danielhb...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e7992c0422..0ecc193468 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3756,6 +3756,19 @@ int spapr_core_dt_populate(SpaprDrc *drc, 
SpaprMachineState *spapr,
 
 spapr_dt_cpu(cs, fdt, offset, spapr);
 
+/*
+ * spapr_dt_cpu() does not fill the 'name' property in the
+ * CPU node. The function is called during boot process, before
+ * and after CAS, and overwriting the 'name' property written
+ * by SLOF is not allowed.
+ *
+ * Write it manually after spapr_dt_cpu(). This makes the hotplug
+ * CPUs more compatible with the coldplugged ones, which have
+ * the 'name' property. Linux Kernel also relies on this
+ * property to identify CPU nodes.
+ */
+_FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
+
 *fdt_start_offset = offset;
 return 0;
 }
-- 
2.29.2




[PULL 00/19] ppc-for-6.0 queue 20210210

2021-02-09 Thread David Gibson
The following changes since commit 1214d55d1c41fbab3a9973a05085b8760647e411:

  Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into 
staging (2021-02-09 13:24:37 +0000)

are available in the Git repository at:

  https://gitlab.com/dgibson/qemu.git tags/ppc-for-6.0-20210210

for you to fetch changes up to 298091f831db1a8f360686369f9760849e90dd03:

  target/ppc: Add E500 L2CSR0 write helper (2021-02-10 14:50:11 +1100)


ppc patch queue for 2021-02-10

Here's the latest batch of patches for the ppc target and machine
types.  Highlights are:
 * Several fixes for E500 from Bin Meng
 * Fixes and cleanups for PowerNV from Cédric Le Goater
 * Assorted other fixes and cleanups


Bin Meng (4):
  hw/ppc: e500: Use a macro for the platform clock frequency
  hw/ppc: e500: Fill in correct <clock-frequency> for the serial nodes
  hw/net: fsl_etsec: Reverse the RCTRL.RSF logic
  target/ppc: Add E500 L2CSR0 write helper

Cédric Le Goater (8):
  ppc/pnv: Add trace events for PCI event notification
  ppc/xive: Add firmware bit when dumping the ENDs
  ppc/pnv: Use skiboot addresses to load kernel and ramfs
  ppc/pnv: Simplify pnv_bmc_create()
  ppc/pnv: Discard internal BMC initialization when BMC is external
  ppc/pnv: Remove default disablement of the PNOR contents
  ppc/pnv: Introduce a LPC FW memory region attribute to map the PNOR
  ppc/pnv: Set default RAM size to 1 GB

Daniel Henrique Barboza (5):
  spapr.c: use g_auto* with 'nodename' in CPU DT functions
  spapr.c: add 'name' property for hotplugged CPUs nodes
  spapr: move spapr_machine_using_legacy_numa() to spapr_numa.c
  spapr_numa.c: create spapr_numa_initial_nvgpu_numa_id() helper
  spapr_numa.c: fix ibm,max-associativity-domains calculation

Greg Kurz (1):
  spapr: Adjust firmware path of PCI devices

Philippe Mathieu-Daudé (1):
  target/ppc: Remove unused MMU definitions

 hw/intc/pnv_xive.c  |  3 +++
 hw/intc/trace-events|  3 +++
 hw/intc/xive.c  |  3 ++-
 hw/net/fsl_etsec/rings.c|  2 +-
 hw/pci-host/pnv_phb4.c  |  3 +++
 hw/pci-host/trace-events|  3 +++
 hw/ppc/e500.c   | 10 ++
 hw/ppc/pnv.c| 27 +++--
 hw/ppc/pnv_bmc.c| 22 ++---
 hw/ppc/pnv_lpc.c| 15 --
 hw/ppc/spapr.c  | 44 -
 hw/ppc/spapr_numa.c | 27 -
 hw/ppc/spapr_pci.c  | 33 +--
 include/hw/pci-host/spapr.h |  2 ++
 include/hw/ppc/pnv.h|  1 +
 include/hw/ppc/spapr.h  |  1 -
 include/hw/ppc/spapr_numa.h |  1 +
 include/hw/ppc/xive_regs.h  |  2 ++
 target/ppc/cpu.h|  9 ++---
 target/ppc/translate_init.c.inc | 16 +++
 20 files changed, 150 insertions(+), 77 deletions(-)



[PATCH] linux-user: fix O_NONBLOCK in signalfd4() and eventfd2() syscalls

2021-02-09 Thread Helge Deller
On the hppa target userspace binaries may call signalfd4() and
eventfd2() with an old TARGET_O_NONBLOCK value of 00024 instead of
00020 for the "mask" syscall parameter, in which case the current
emulation doesn't handle the translation to the native O_NONBLOCK value
correctly.

The 0x04 bit is not masked out before the new O_NONBLOCK bit is set,
and as such the call to the native syscall errors out with EINVAL.

Fix this by introducing TARGET_O_NONBLOCK_MASK which is used to mask off
all possible bits. This define defaults to TARGET_O_NONBLOCK when not
defined otherwise, so for all other targets the implementation will
behave as before.

This patch needs to be applied on top of my previous two patches.

Bug was found and patch was verified by using qemu-hppa as debian buildd
server on x86_64.

Signed-off-by: Helge Deller 

---

diff --git a/linux-user/hppa/target_fcntl.h b/linux-user/hppa/target_fcntl.h
index 08e3a4fcb0..4eb0ec98e2 100644
--- a/linux-user/hppa/target_fcntl.h
+++ b/linux-user/hppa/target_fcntl.h
@@ -9,6 +9,7 @@
 #define HPPA_TARGET_FCNTL_H

 #define TARGET_O_NONBLOCK00020
+#define TARGET_O_NONBLOCK_MASK 00024 /* includes old HP-UX NDELAY flag */
 #define TARGET_O_APPEND  00010
 #define TARGET_O_CREAT   00400 /* not fcntl */
 #define TARGET_O_EXCL02000 /* not fcntl */
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 27adee908e..3031aa342f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -273,6 +273,11 @@ static type name (type1 arg1,type2 arg2,type3 arg3,type4 
arg4,type5 arg5,  \
 #define TARGET_NR__llseek TARGET_NR_llseek
 #endif

+/* some platforms need to mask more bits than just TARGET_O_NONBLOCK */
+#ifndef TARGET_O_NONBLOCK_MASK
+#define TARGET_O_NONBLOCK_MASK TARGET_O_NONBLOCK
+#endif
+
 #define __NR_sys_gettid __NR_gettid
 _syscall0(int, sys_gettid)

@@ -7719,7 +7724,7 @@ static abi_long do_signalfd4(int fd, abi_long mask, int 
flags)
 sigset_t host_mask;
 abi_long ret;

-if (flags & ~(TARGET_O_NONBLOCK | TARGET_O_CLOEXEC)) {
+if (flags & ~(TARGET_O_NONBLOCK_MASK | TARGET_O_CLOEXEC)) {
 return -TARGET_EINVAL;
 }
 if (!lock_user_struct(VERIFY_READ, target_mask, mask, 1)) {
@@ -12508,7 +12513,7 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 #if defined(TARGET_NR_eventfd2)
 case TARGET_NR_eventfd2:
 {
-int host_flags = arg2 & (~(TARGET_O_NONBLOCK | TARGET_O_CLOEXEC));
+int host_flags = arg2 & (~(TARGET_O_NONBLOCK_MASK | TARGET_O_CLOEXEC));
 if (arg2 & TARGET_O_NONBLOCK) {
 host_flags |= O_NONBLOCK;
 }



Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 8:39 PM, Dmitry Fomichev wrote:
> On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote:
>> From: Klaus Jensen 
>>
>> The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so
>> improve the user experience by adding an early parameter check in
>> nvme_check_constraints.
>>
>> When ZASL is still too small due to the host configuring the device for
>> an even larger page size, convert the trace point in nvme_start_ctrl to
>> an NVME_GUEST_ERR such that this is logged by QEMU instead of only
>> traced.
>>
>> Reported-by: "i...@dantalion.nl" 

Apparently the reporter signed 'Corne'.

>> Cc: Dmitry Fomichev 
>> Signed-off-by: Klaus Jensen 
>> ---
>>  hw/block/nvme.c | 12 ++--
>>  1 file changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index c2f0c88fbf39..d96888cd2333 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>>  n->zasl = n->params.mdts;
>>  } else {
>>  if (n->params.zasl_bs < n->page_size) {
>> -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
>> -n->page_size);
>> +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small,
>> +   "Zone Append Size Limit (ZASL) of %d bytes is 
>> too "
>> +   "small; must be at least %d bytes",
>> +   n->params.zasl_bs, n->page_size);
>>  return -1;
>>  }
>>  n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
>> @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
>> **errp)
>>  error_setg(errp, "zone append size limit has to be a power of 
>> 2");
>>  return;
>>  }
>> +
>> +if (n->params.zasl_bs < 4096) {
>> +error_setg(errp, "zone append size limit must be at least "
>> +   "4096 bytes");
>> +return;
>> +}
>>  }
>>  }
> 
> The guest error is less confusing than simply a trace. LGTM.

Trace events are meant for the developers when debugging, they
are usually stripped out in final build.

Errors are reported to the user / operator (i.e. incorrect
configuration).

Regards,

Phil.




Re: [RFC 09/10] vhost: Route guest->host notification through shadow virtqueue

2021-02-09 Thread Jason Wang



On 2021/2/9 下午11:02, Eugenio Perez Martin wrote:

On Thu, Feb 4, 2021 at 4:27 AM Jason Wang  wrote:


On 2021/2/2 下午6:08, Eugenio Perez Martin wrote:

On Mon, Feb 1, 2021 at 7:29 AM Jason Wang  wrote:

On 2021/1/30 上午4:54, Eugenio Pérez wrote:

Shadow virtqueue notifications forwarding is disabled when vhost_dev
stops.

Signed-off-by: Eugenio Pérez 
---
hw/virtio/vhost-shadow-virtqueue.h |   5 ++
include/hw/virtio/vhost.h  |   4 +
hw/virtio/vhost-shadow-virtqueue.c | 123 +-
hw/virtio/vhost.c  | 135 -
4 files changed, 264 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index 6cc18d6acb..466f8ae595 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -17,6 +17,11 @@

typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;

+bool vhost_shadow_vq_start_rcu(struct vhost_dev *dev,
+   VhostShadowVirtqueue *svq);
+void vhost_shadow_vq_stop_rcu(struct vhost_dev *dev,
+  VhostShadowVirtqueue *svq);
+
VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);

void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 2be782cefd..732a4b2a2b 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -55,6 +55,8 @@ struct vhost_iommu {
QLIST_ENTRY(vhost_iommu) iommu_next;
};

+typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
+
typedef struct VhostDevConfigOps {
/* Vhost device config space changed callback
 */
@@ -83,7 +85,9 @@ struct vhost_dev {
uint64_t backend_cap;
bool started;
bool log_enabled;
+bool sw_lm_enabled;
uint64_t log_size;
+VhostShadowVirtqueue **shadow_vqs;
Error *migration_blocker;
const VhostOps *vhost_ops;
void *opaque;
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index c0c967a7c5..908c36c66d 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -8,15 +8,129 @@
 */

#include "hw/virtio/vhost-shadow-virtqueue.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
+
+#include "standard-headers/linux/vhost_types.h"
+#include "standard-headers/linux/virtio_ring.h"

#include "qemu/error-report.h"
-#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"

typedef struct VhostShadowVirtqueue {
EventNotifier kick_notifier;
EventNotifier call_notifier;
+const struct vhost_virtqueue *hvq;
+VirtIODevice *vdev;
+VirtQueue *vq;
} VhostShadowVirtqueue;

So instead of doing things at virtio level, how about do the shadow
stuffs at vhost level?

It works like:

virtio -> [shadow vhost backend] -> vhost backend

Then the QMP is used to plug the shadow vhost backend in the middle or not.

It looks kind of easier since we don't need to deal with virtqueue
handlers etc.. Instead, we just need to deal with eventfd stuffs:

When shadow vhost mode is enabled, we just intercept the host_notifiers
and guest_notifiers. When it was disabled, we just pass the host/guest
notifiers to the real vhost backends?


Hi Jason.

Sure we can try that model, but it seems to me that it comes with a
different set of problems.

For example, there are code in vhost.c that checks if implementations
are available in vhost_ops, like:

if (dev->vhost_ops->vhost_vq_get_addr) {
  r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
  ...
}

I can count 14 of these, checking:

dev->vhost_ops->vhost_backend_can_merge
dev->vhost_ops->vhost_backend_mem_section_filter
dev->vhost_ops->vhost_force_iommu
dev->vhost_ops->vhost_requires_shm_log
dev->vhost_ops->vhost_set_backend_cap
dev->vhost_ops->vhost_set_vring_busyloop_timeout
dev->vhost_ops->vhost_vq_get_addr
hdev->vhost_ops->vhost_dev_start
hdev->vhost_ops->vhost_get_config
hdev->vhost_ops->vhost_get_inflight_fd
hdev->vhost_ops->vhost_net_set_backend
hdev->vhost_ops->vhost_set_config
hdev->vhost_ops->vhost_set_inflight_fd
hdev->vhost_ops->vhost_set_iotlb_callback

So we should Implement all of the vhost_ops callbacks, forwarding them
to actual vhost_backed, and delete conditionally these ones? In other
words, dynamically generate the new shadow vq vhost_ops? If a new
callback is added to any vhost backend in the future, do we have to
force the adding / checking for NULL in shadow backend vhost_ops?
Would this be a good moment to check if all backends implement these
and delete the checks?


I think it won't be easy if we want to support all kinds of vhost
backends from the start. So we can go with vhost-vdpa one first.

Actually how it work might be something like (no need to switch
vhost_ops, we can do everything silently in the ops)

1) when device to switch to shadow 

Re: [RFC 05/10] vhost: Add vhost_dev_from_virtio

2021-02-09 Thread Jason Wang



On 2021/2/9 下午11:35, Eugenio Perez Martin wrote:

On Fri, Feb 5, 2021 at 4:52 AM Jason Wang  wrote:


On 2021/2/4 下午5:25, Eugenio Perez Martin wrote:

On Thu, Feb 4, 2021 at 4:14 AM Jason Wang  wrote:

On 2021/2/2 下午6:17, Eugenio Perez Martin wrote:

On Tue, Feb 2, 2021 at 4:31 AM Jason Wang  wrote:

On 2021/2/1 下午4:28, Eugenio Perez Martin wrote:

On Mon, Feb 1, 2021 at 7:13 AM Jason Wang  wrote:

On 2021/1/30 上午4:54, Eugenio Pérez wrote:

Signed-off-by: Eugenio Pérez 
---
  include/hw/virtio/vhost.h |  1 +
  hw/virtio/vhost.c | 17 +
  2 files changed, 18 insertions(+)

diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 4a8bc75415..fca076e3f0 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -123,6 +123,7 @@ uint64_t vhost_get_features(struct vhost_dev *hdev, const 
int *feature_bits,
  void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
  uint64_t features);
  bool vhost_has_free_slot(void);
+struct vhost_dev *vhost_dev_from_virtio(const VirtIODevice *vdev);

  int vhost_net_set_backend(struct vhost_dev *hdev,
struct vhost_vring_file *file);
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 28c7d78172..8683d507f5 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -61,6 +61,23 @@ bool vhost_has_free_slot(void)
  return slots_limit > used_memslots;
  }

+/*
+ * Get the vhost device associated to a VirtIO device.
+ */
+struct vhost_dev *vhost_dev_from_virtio(const VirtIODevice *vdev)
+{
+struct vhost_dev *hdev;
+
+QLIST_FOREACH(hdev, &vhost_devices, entry) {
+if (hdev->vdev == vdev) {
+return hdev;
+}
+}
+
+assert(hdev);
+return NULL;
+}

I'm not sure this can work in the case of multiqueue. E.g vhost-net
multiqueue is a N:1 mapping between vhost devics and virtio devices.

Thanks


Right. We could add an "vdev vq index" parameter to the function in
this case, but I guess the most reliable way to do this is to add a
vhost_opaque value to VirtQueue, as Stefan proposed in previous RFC.

So the question still, it looks like it's easier to hide the shadow
virtqueue stuffs at vhost layer instead of expose them to virtio layer:

1) vhost protocol is stable ABI
2) no need to deal with virtio stuffs which is more complex than vhost

Or are there any advantages if we do it at virtio layer?


As far as I can tell, we will need the virtio layer the moment we
start copying/translating buffers.

In this series, the virtio dependency can be reduced if qemu does not
check the used ring _F_NO_NOTIFY flag before writing to irqfd. It
would enable packed queues and IOMMU immediately, and I think the cost
should not be so high. In the previous RFC this check was deleted
later anyway, so I think it was a bad idea to include it from the start.

I am not sure I understand here. For vhost, we can still do anything we
want, e.g accessing guest memory etc. Any blocker that prevent us from
copying/translating buffers? (Note that qemu will propagate memory
mappings to vhost).


There is nothing that forbids us to access directly, but if we don't
reuse the virtio layer functionality we would have to duplicate every
access function. "Need" was a too strong word maybe :).

In other words: for the shadow vq vring exposed for the device, qemu
treats it as a driver, and this functionality needs to be added to
qemu. But for accessing the guest's one do not reuse virtio.c would be
a bad idea in my opinion.


The problem is, virtio.c is not a library and it has a lot of dependency
with other qemu modules basically makes it impossible to be reused at
vhost level.


While virtio.c as a whole has dependencies, I think that the functions
needed in the original RFC do not have these dependencies.

However I see how to split vring dataplane from virtio device
management can benefit.



If you can split them out, that would be fine.





We can solve this by:

1) split the core functions out as a library or
2) switch to use contrib/lib-vhostuser but needs to decouple UNIX socket
transport

None of the above looks trivial and they are only device codes. For
shadow virtqueue, we need driver codes as well where no code can be reused.

As we discussed, we probably need IOVA allocated when forwarding
descriptors between the two virtqueues. So my feeling is we can have our
own codes to start then we can consider whether we can reuse some from
the existing virtio.c or lib-vhostuser.


As I see it, if we develop our own code a lot of it will be copied
from current virtio.c, which itself duplicates a lot of contrib/ lib
functionality.

Maybe it's better to combine your proposals and decouple the vring
functions, the vhost transport, and the qemu virtio device management,
so other projects can reuse them directly?



I think this can work.




I still think this can be left for a later series with buffer
forwarding on 

Re: [PATCH v2 0/5] Move remaining x86 Travis jobs to the gitlab-CI

2021-02-09 Thread Thomas Huth

On 09/02/2021 21.37, Alex Bennée wrote:


Thomas Huth  writes:


Since Travis changed their policies, travis-ci.org will soon become
completely useless for the QEMU project. We should now really make sure
that we move the remaining tests as good as possible to the gitlab-CI
instead.


Queued to testing/next, thanks.


Thanks, but please unqueue them again, I still want to send a v3 to address 
your comment on the -fsanitize=undefined patch... and I also noticed that 
the gprof/gcov job runs very long and sometimes hits the 1h time limit, so I 
need to revisit the set of target architectures there...


 Thomas




Re: [PATCH] target/ppc: Fix truncation of env->hflags

2021-02-09 Thread David Gibson
On Sat, Jan 23, 2021 at 05:24:22PM -1000, Richard Henderson wrote:
> Use the cs_base field, because it happens to be the same
> size as hflags (and MSR, from which hflags is derived).
> 
> In translate, extract most bits from a local hflags variable.
> Mark several cases where code generation is *not* derived from
> data stored within the hashed elements of the TranslationBlock.
> 
> Cc: David Gibson 
> Reported-by: Ivan Warren 
> Signed-off-by: Richard Henderson 

Well, I don't know why, but somehow this patch is breaking one of the
acceptance tests:

 (043/134) 
tests/acceptance/boot_linux_console.py:BootLinuxConsole.test_ppc64_e500: 
INTERRUPTED: Test interrupted by SIGTERM\nRunner error occurred: Timeout 
reached\nOriginal status: ERROR\n{'name': 
'043-tests/acceptance/boot_linux_console.py:BootLinuxConsole.test_ppc64_e500', 
'logdir': 
'/home/dwg/src/qemu/build/normal/tests/results/job-2021-02-10T15.04... (90.26 s)

From that timeout, I'm guessing something about this is causing the
boot to wedge.

So, I've removed this from my tree for now, I'll need a fixed version
to proceed with.


> ---
>  target/ppc/cpu.h   |  4 +--
>  target/ppc/translate.c | 64 --
>  2 files changed, 26 insertions(+), 42 deletions(-)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 2609e4082e..4a05e4e544 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -2396,8 +2396,8 @@ static inline void cpu_get_tb_cpu_state(CPUPPCState 
> *env, target_ulong *pc,
>  target_ulong *cs_base, uint32_t 
> *flags)
>  {
>  *pc = env->nip;
> -*cs_base = 0;
> -*flags = env->hflags;
> +*cs_base = env->hflags;
> +*flags = 0;
>  }
>  
>  void QEMU_NORETURN raise_exception(CPUPPCState *env, uint32_t exception);
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 0984ce637b..1eb2e1b0c6 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -7879,47 +7879,37 @@ static void 
> ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
>  {
>  DisasContext *ctx = container_of(dcbase, DisasContext, base);
>  CPUPPCState *env = cs->env_ptr;
> +target_ulong hflags = ctx->base.tb->cs_base;
>  int bound;
>  
>  ctx->exception = POWERPC_EXCP_NONE;
>  ctx->spr_cb = env->spr_cb;
> -ctx->pr = msr_pr;
> +ctx->pr = (hflags >> MSR_PR) & 1;
>  ctx->mem_idx = env->dmmu_idx;
> -ctx->dr = msr_dr;
> -#if !defined(CONFIG_USER_ONLY)
> -ctx->hv = msr_hv || !env->has_hv_mode;
> +ctx->dr = (hflags >> MSR_DR) & 1;
> +#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
> +ctx->hv = (hflags >> MSR_HV) & 1;
>  #endif
>  ctx->insns_flags = env->insns_flags;
>  ctx->insns_flags2 = env->insns_flags2;
>  ctx->access_type = -1;
>  ctx->need_access_type = !mmu_is_64bit(env->mmu_model);
> -ctx->le_mode = !!(env->hflags & (1 << MSR_LE));
> +ctx->le_mode = (hflags >> MSR_LE) & 1;
>  ctx->default_tcg_memop_mask = ctx->le_mode ? MO_LE : MO_BE;
>  ctx->flags = env->flags;
>  #if defined(TARGET_PPC64)
> -ctx->sf_mode = msr_is_64bit(env, env->msr);
> +ctx->sf_mode = (hflags >> MSR_SF) & 1;
>  ctx->has_cfar = !!(env->flags & POWERPC_FLAG_CFAR);
>  #endif
>  ctx->lazy_tlb_flush = env->mmu_model == POWERPC_MMU_32B
>  || env->mmu_model == POWERPC_MMU_601
>  || env->mmu_model & POWERPC_MMU_64;
>  
> -ctx->fpu_enabled = !!msr_fp;
> -if ((env->flags & POWERPC_FLAG_SPE) && msr_spe) {
> -ctx->spe_enabled = !!msr_spe;
> -} else {
> -ctx->spe_enabled = false;
> -}
> -if ((env->flags & POWERPC_FLAG_VRE) && msr_vr) {
> -ctx->altivec_enabled = !!msr_vr;
> -} else {
> -ctx->altivec_enabled = false;
> -}
> -if ((env->flags & POWERPC_FLAG_VSX) && msr_vsx) {
> -ctx->vsx_enabled = !!msr_vsx;
> -} else {
> -ctx->vsx_enabled = false;
> -}
> +ctx->fpu_enabled = (hflags >> MSR_FP) & 1;
> +ctx->spe_enabled = (hflags >> MSR_SPE) & 1;
> +ctx->altivec_enabled = (hflags >> MSR_VR) & 1;
> +ctx->vsx_enabled = (hflags >> MSR_VSX) & 1;
> +/* FIXME: This needs to be stored in env->hflags_nmsr. */
>  if ((env->flags & POWERPC_FLAG_SCV)
>  && (env->spr[SPR_FSCR] & (1ull << FSCR_SCV))) {
>  ctx->scv_enabled = true;
> @@ -7927,23 +7917,21 @@ static void 
> ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
>  ctx->scv_enabled = false;
>  }
>  #if defined(TARGET_PPC64)
> -if ((env->flags & POWERPC_FLAG_TM) && msr_tm) {
> -ctx->tm_enabled = !!msr_tm;
> -} else {
> -ctx->tm_enabled = false;
> -}
> +ctx->tm_enabled = (hflags >> MSR_TM) & 1;
>  #endif
> +/* FIXME: This needs to be stored in env->hflags_nmsr. */
>  ctx->gtse = !!(env->spr[SPR_LPCR] & LPCR_GTSE);
> -if ((env->flags & POWERPC_FLAG_SE) && msr_se) {
> -ctx->singlestep_enabled = 

Re: [PATCH] vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support

2021-02-09 Thread Jason Wang



On 2021/2/9 上午2:26, Peter Xu wrote:

Kevin,

On Mon, Feb 08, 2021 at 07:03:08AM +, Tian, Kevin wrote:

It really depends on the definition of dev-iotlb in this context. To me the
fact that virtio-iommu needs to notify the kernel for updating split cache
is already sort of dev-iotlb semantics, regardless of whether it's delivered
through a iotlb message or dev-iotlb message in a specific implementation. 

Yeah maybe it turns out that we'll just need to implement dev-iotlb for
virtio-iommu.



Note that on top of device-IOTLB, device may choose to implement an 
IOMMU which support #PF. In this case, dev-iotlb semantic is not a must. 
(Or it can co-operate with things like ATS if driver wants)


Virtio will probably provide this feature in the future.

Thanks




I am completely fine with that and I'm never against it. :) I was throwing out
a pure question only, because I don't know the answer.

My question was majorly based on the fact that dev-iotlb and iotlb messages
really look the same; it's not obvious then whether it would always matter a
lot when in a full emulation environment.

One example is current vhost - vhost previously would work without dev-iotlb
(ats=on) because trapping UNMAP would work too for vhost to work.  It's also
simply because at least for VT-d the driver needs to send both one dev-iotlb
and one (probably same) iotlb message for a single page invalidation.  The
dev-iotlb won't help a lot in full emulation here but instead it slows thing
down a little bit (QEMU has full knowledge as long as it receives either of the
message).

Thanks,






Re: [PATCH v3] hw/net: fsl_etsec: Reverse the RCTRL.RSF logic

2021-02-09 Thread David Gibson
On Wed, Feb 10, 2021 at 10:10:21AM +0800, Bin Meng wrote:
> From: Bin Meng 
> 
> Per MPC8548ERM [1] chapter 14.5.3.4.1:
> 
> When RCTRL.RSF is 1, frames less than 64 bytes are accepted upon
> a DA match. But currently QEMU does the opposite. This commit
> reverses the RCTRL.RSF testing logic to match the manual.
> 
> Due to the reverse of the logic, certain guests may potentially
> break if they don't program eTSEC to have RCTRL.RSF bit set.
> When RCTRL.RSF is 0, short frames are silently dropped, however
> as of today both slirp and tap networking do not pad short frames
> (e.g.: an ARP packet) to the minimum frame size of 60 bytes. So
> ARP requests will be dropped, preventing the guest from becoming
> visible on the network.
> 
> The same issue was reported on e1000 and vmxenet3 before, see:
> 
> commit 78aeb23eded2 ("e1000: Pad short frames to minimum size (60 bytes)")
> commit 40a87c6c9b11 ("vmxnet3: Pad short frames to minimum size (60 bytes)")
> 
> [1] https://www.nxp.com/docs/en/reference-manual/MPC8548ERM.pdf
> 
> Fixes: eb1e7c3e5146 ("Add Enhanced Three-Speed Ethernet Controller (eTSEC)")
> Signed-off-by: Bin Meng 

Applied to ppc-for-6.0.

> 
> ---
> 
> Changes in v3:
> - remove the slirp/tap networking workaround and only do the reverse
> 
> Changes in v2:
> - rewrite the commit message and reverse the RCTRL.RSF test logic
> 
>  hw/net/fsl_etsec/rings.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
> index 121415a..fe055d3 100644
> --- a/hw/net/fsl_etsec/rings.c
> +++ b/hw/net/fsl_etsec/rings.c
> @@ -502,7 +502,7 @@ ssize_t etsec_rx_ring_write(eTSEC *etsec, const uint8_t 
> *buf, size_t size)
>  return -1;
>  }
>  
> -if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
> +if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
>  /* CRC is not in the packet yet, so short frame is below 60 bytes */
>  RING_DEBUG("%s: Drop short frame\n", __func__);
>  return -1;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command

2021-02-09 Thread Keith Busch
On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> On Feb  9 03:59, Keith Busch wrote:
> > This whole implementation would be much simpler with the synchronous
> > blk_flush() routine instead of the AIO equivalent. This is not really a
> > performant feature, so I don't think it's critical to get these
> > operations happening in parallel. What do you think?
> 
> It would definitely be simpler, but I believe that if there is a lot to
> flush, then we won't just block the nvme device. We are holding the Big
> QEMU Lock and will block most other devices as well.

Hm, I feel like you may have told me this same explanation for a
different patch. :) Okay, I'm convinced: this is the way.



[PATCH v2] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread Bin Meng
From: Bin Meng 

Per EREF 2.0 [1] chapter 3.11.2:

The following bits in L2CSR0 (exists in the e500mc/e5500/e6500 core):

- L2FI  (L2 cache flash invalidate)
- L2FL  (L2 cache flush)
- L2LFC (L2 cache lock flash clear)

when set, a cache operation is initiated by hardware, and these bits
will be cleared when the operation is complete.

Since we don't model cache in QEMU, let's add a write helper to emulate
the cache operations completing instantly.

[1] https://www.nxp.com/files-static/32bit/doc/ref_manual/EREFRM.pdf

Signed-off-by: Bin Meng 

---

Changes in v2:
- Add Freescale manual link and clarifications in the commit message

 target/ppc/cpu.h|  6 ++
 target/ppc/translate_init.c.inc | 16 
 2 files changed, 22 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 2609e40..e77911a 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1919,6 +1919,7 @@ typedef PowerPCCPU ArchCPU;
 #define SPR_750FX_HID2(0x3F8)
 #define SPR_Exxx_L1FINV0  (0x3F8)
 #define SPR_L2CR  (0x3F9)
+#define SPR_Exxx_L2CSR0   (0x3F9)
 #define SPR_L3CR  (0x3FA)
 #define SPR_750_TDCH  (0x3FA)
 #define SPR_IABR2 (0x3FA)
@@ -1974,6 +1975,11 @@ typedef PowerPCCPU ArchCPU;
 #define   L1CSR1_ICFI   0x0002  /* Instruction Cache Flash Invalidate */
 #define   L1CSR1_ICE0x0001  /* Instruction Cache Enable */
 
+/* E500 L2CSR0 */
+#define E500_L2CSR0_L2FI(1 << 21)   /* L2 cache flash invalidate */
+#define E500_L2CSR0_L2FL(1 << 11)   /* L2 cache flush */
+#define E500_L2CSR0_L2LFC   (1 << 10)   /* L2 cache lock flash clear */
+
 /* HID0 bits */
 #define HID0_DEEPNAP(1 << 24)   /* pre-2.06 */
 #define HID0_DOZE   (1 << 23)   /* pre-2.06 */
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index 9867d0a..3ec45cb 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -1735,6 +1735,16 @@ static void spr_write_e500_l1csr1(DisasContext *ctx, int 
sprn, int gprn)
 tcg_temp_free(t0);
 }
 
+static void spr_write_e500_l2csr0(DisasContext *ctx, int sprn, int gprn)
+{
+TCGv t0 = tcg_temp_new();
+
+tcg_gen_andi_tl(t0, cpu_gpr[gprn],
+~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL | 
E500_L2CSR0_L2LFC));
+gen_store_spr(sprn, t0);
+tcg_temp_free(t0);
+}
+
 static void spr_write_booke206_mmucsr0(DisasContext *ctx, int sprn, int gprn)
 {
 gen_helper_booke206_tlbflush(cpu_env, cpu_gpr[gprn]);
@@ -5029,6 +5039,12 @@ static void init_proc_e500(CPUPPCState *env, int version)
  SPR_NOACCESS, SPR_NOACCESS,
 &spr_read_generic, &spr_write_e500_l1csr1,
  0x);
+if (version != fsl_e500v1 && version != fsl_e500v2) {
+spr_register(env, SPR_Exxx_L2CSR0, "L2CSR0",
+ SPR_NOACCESS, SPR_NOACCESS,
+ &spr_read_generic, &spr_write_e500_l2csr0,
+ 0x);
+}
 spr_register(env, SPR_BOOKE_MCSRR0, "MCSRR0",
  SPR_NOACCESS, SPR_NOACCESS,
 &spr_read_generic, &spr_write_generic,
-- 
2.7.4




Re: [PATCH v3 0/3]

2021-02-09 Thread Doug Evans
On Thu, Feb 4, 2021 at 10:25 AM Doug Evans  wrote:

> On Thu, Feb 4, 2021 at 2:03 AM Daniel P. Berrangé 
> wrote:
>
>> On Wed, Feb 03, 2021 at 03:35:36PM -0800, dje--- via wrote:
>> > Add support for ipv6 host forwarding
>> >
>> > This patchset takes the original patch from Maxim,
>> > https://www.mail-archive.com/qemu-devel@nongnu.org/msg569573.html
>> > and updates it.
>> >
>> > New option: -ipv6-hostfwd
>> >
>> > New commands: ipv6_hostfwd_add, ipv6_hostfwd_remove
>> >
>> > These are the ipv6 equivalents of their ipv4 counterparts.
>>
>> Before I noticed this v3, I send a reply to your v2 sugesting
>> that we don't need to add any new commands/options. We can
>> use existing inet_parse() helper function to parse the address
>> info and transparently support IPv4/6 in the existing commands
>> and options. This matches normal practice elsewhere in QEMU
>> for IP dual stack.
>>
>
> I'm all for this, fwiw.
>


I should say I'm all for not adding new commands/options.
Looking at inet_parse() it cannot be used as-is.
The question then becomes: Will refactoring it buy enough?


Re: [PATCH] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread Bin Meng
Hi David,

On Wed, Feb 10, 2021 at 10:09 AM David Gibson
 wrote:
>
> On Wed, Feb 10, 2021 at 09:53:53AM +0800, Bin Meng wrote:
> > Hi David,
> >
> > On Wed, Feb 10, 2021 at 9:50 AM David Gibson
> >  wrote:
> > >
> > > On Mon, Feb 08, 2021 at 05:40:58PM +0800, Bin Meng wrote:
> > > > From: Bin Meng 
> > > >
> > > > There are several bits in L2CSR0 (exists in the e500mc/e5500/e6500
> > > > core) that should be self-cleared when written:
> > > >
> > > > - L2FI  (L2 cache flash invalidate)
> > > > - L2FL  (L2 cache flush)
> > > > - L2LFC (L2 cache lock flash clear)
> > > >
> > > > Add a write helper to emulate this behavior.
> > > >
> > > > Signed-off-by: Bin Meng 
> > >
> > > IIUC, these are essentially write-only bits - they have some side
> > > effect when written on real hardware, but won't ever be read back.  Is
> > > that correct?  Do you have a reference to hardware docs describing
> > > this behaviour?
> > >
> >
> > Please see https://www.nxp.com/files-static/32bit/doc/ref_manual/EREFRM.pdf,
> > chapter 3.11.2
>
> Ah, thanks.  So these actually don't operate quite how I was
> suggesting - they are readable, and return 1 until the operation is
> completed.
>
> So what you're effectively doing here is simulating the cache
> operations completing instantly - which is correct because we don't
> model the cache.
>
> Can you please clarify that in your commit message, including the
> pointer to the chip doc.

Sure, will do in v2.

>
> > > I'm assuming that because we don't model the L2 cache, it's ok that
> > > your implementation just ignores writing these bits, rather than
> > > performing the cache operations requested?
> >
> > Yes, guests may read back these bits to confirm the operation is done
> > by hardware after writing 1 to these bits.
> >
> > >
> > > Is that still true for the flash clear operation?
> >
> > Yes.
>
> Ah, yes, I see.  The name made me think this might be something like
> dcbz, which has visible effects on architected state.  This is just
> clearing cache locks, which we don't model in any case.
>

Thanks for the review.

Regards,
Bin



[PATCH v3] hw/net: fsl_etsec: Reverse the RCTRL.RSF logic

2021-02-09 Thread Bin Meng
From: Bin Meng 

Per MPC8548ERM [1] chapter 14.5.3.4.1:

When RCTRL.RSF is 1, frames less than 64 bytes are accepted upon
a DA match. But currently QEMU does the opposite. This commit
reverses the RCTRL.RSF testing logic to match the manual.

Due to the reverse of the logic, certain guests may potentially
break if they don't program eTSEC to have RCTRL.RSF bit set.
When RCTRL.RSF is 0, short frames are silently dropped, however
as of today both slirp and tap networking do not pad short frames
(e.g.: an ARP packet) to the minimum frame size of 60 bytes. So
ARP requests will be dropped, preventing the guest from becoming
visible on the network.

The same issue was reported on e1000 and vmxenet3 before, see:

commit 78aeb23eded2 ("e1000: Pad short frames to minimum size (60 bytes)")
commit 40a87c6c9b11 ("vmxnet3: Pad short frames to minimum size (60 bytes)")

[1] https://www.nxp.com/docs/en/reference-manual/MPC8548ERM.pdf

Fixes: eb1e7c3e5146 ("Add Enhanced Three-Speed Ethernet Controller (eTSEC)")
Signed-off-by: Bin Meng 

---

Changes in v3:
- remove the slirp/tap networking workaround and only do the reverse

Changes in v2:
- rewrite the commit message and reverse the RCTRL.RSF test logic

 hw/net/fsl_etsec/rings.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 121415a..fe055d3 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -502,7 +502,7 @@ ssize_t etsec_rx_ring_write(eTSEC *etsec, const uint8_t 
*buf, size_t size)
 return -1;
 }
 
-if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
+if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
 /* CRC is not in the packet yet, so short frame is below 60 bytes */
 RING_DEBUG("%s: Drop short frame\n", __func__);
 return -1;
-- 
2.7.4




Re: [PATCH] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread David Gibson
On Wed, Feb 10, 2021 at 09:53:53AM +0800, Bin Meng wrote:
> Hi David,
> 
> On Wed, Feb 10, 2021 at 9:50 AM David Gibson
>  wrote:
> >
> > On Mon, Feb 08, 2021 at 05:40:58PM +0800, Bin Meng wrote:
> > > From: Bin Meng 
> > >
> > > There are several bits in L2CSR0 (exists in the e500mc/e5500/e6500
> > > core) that should be self-cleared when written:
> > >
> > > - L2FI  (L2 cache flash invalidate)
> > > - L2FL  (L2 cache flush)
> > > - L2LFC (L2 cache lock flash clear)
> > >
> > > Add a write helper to emulate this behavior.
> > >
> > > Signed-off-by: Bin Meng 
> >
> > IIUC, these are essentially write-only bits - they have some side
> > effect when written on real hardware, but won't ever be read back.  Is
> > that correct?  Do you have a reference to hardware docs describing
> > this behaviour?
> >
> 
> Please see https://www.nxp.com/files-static/32bit/doc/ref_manual/EREFRM.pdf,
> chapter 3.11.2

Ah, thanks.  So these actually don't operate quite how I was
suggesting - they are readable, and return 1 until the operation is
completed.

So what you're effectively doing here is simulating the cache
operations completing instantly - which is correct because we don't
model the cache.

Can you please clarify that in your commit message, including the
pointer to the chip doc.

> > I'm assuming that because we don't model the L2 cache, it's ok that
> > your implementation just ignores writing these bits, rather than
> > performing the cache operations requested?
> 
> Yes, guests may read back these bits to confirm the operation is done
> by hardware after writing 1 to these bits.
> 
> >
> > Is that still true for the flash clear operation?
> 
> Yes.

Ah, yes, I see.  The name made me think this might be something like
dcbz, which has visible effects on architected state.  This is just
clearing cache locks, which we don't model in any case.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread Bin Meng
Hi David,

On Wed, Feb 10, 2021 at 9:50 AM David Gibson
 wrote:
>
> On Mon, Feb 08, 2021 at 05:40:58PM +0800, Bin Meng wrote:
> > From: Bin Meng 
> >
> > There are several bits in L2CSR0 (exists in the e500mc/e5500/e6500
> > core) that should be self-cleared when written:
> >
> > - L2FI  (L2 cache flash invalidate)
> > - L2FL  (L2 cache flush)
> > - L2LFC (L2 cache lock flash clear)
> >
> > Add a write helper to emulate this behavior.
> >
> > Signed-off-by: Bin Meng 
>
> IIUC, these are essentially write-only bits - they have some side
> effect when written on real hardware, but won't ever be read back.  Is
> that correct?  Do you have a reference to hardware docs describing
> this behaviour?
>

Please see https://www.nxp.com/files-static/32bit/doc/ref_manual/EREFRM.pdf,
chapter 3.11.2

> I'm assuming that because we don't model the L2 cache, it's ok that
> your implementation just ignores writing these bits, rather than
> performing the cache operations requested?

Yes, guests may read back these bits to confirm the operation is done
by hardware after writing 1 to these bits.

>
> Is that still true for the flash clear operation?

Yes.

>
> > ---
> >
> >  target/ppc/cpu.h|  6 ++
> >  target/ppc/translate_init.c.inc | 16 
> >  2 files changed, 22 insertions(+)
> >

Regards,
Bin



Re: [PATCH] target/ppc: Add E500 L2CSR0 write helper

2021-02-09 Thread David Gibson
On Mon, Feb 08, 2021 at 05:40:58PM +0800, Bin Meng wrote:
> From: Bin Meng 
> 
> There are several bits in L2CSR0 (exists in the e500mc/e5500/e6500
> core) that should be self-cleared when written:
> 
> - L2FI  (L2 cache flash invalidate)
> - L2FL  (L2 cache flush)
> - L2LFC (L2 cache lock flash clear)
> 
> Add a write helper to emulate this behavior.
> 
> Signed-off-by: Bin Meng 

IIUC, these are essentially write-only bits - they have some side
effect when written on real hardware, but won't ever be read back.  Is
that correct?  Do you have a reference to hardware docs describing
this behaviour?

I'm assuming that because we don't model the L2 cache, it's ok that
your implementation just ignores writing these bits, rather than
performing the cache operations requested?

Is that still true for the flash clear operation?

> ---
> 
>  target/ppc/cpu.h|  6 ++
>  target/ppc/translate_init.c.inc | 16 
>  2 files changed, 22 insertions(+)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 2609e40..e77911a 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1919,6 +1919,7 @@ typedef PowerPCCPU ArchCPU;
>  #define SPR_750FX_HID2(0x3F8)
>  #define SPR_Exxx_L1FINV0  (0x3F8)
>  #define SPR_L2CR  (0x3F9)
> +#define SPR_Exxx_L2CSR0   (0x3F9)
>  #define SPR_L3CR  (0x3FA)
>  #define SPR_750_TDCH  (0x3FA)
>  #define SPR_IABR2 (0x3FA)
> @@ -1974,6 +1975,11 @@ typedef PowerPCCPU ArchCPU;
>  #define   L1CSR1_ICFI   0x0002  /* Instruction Cache Flash Invalidate */
>  #define   L1CSR1_ICE0x0001  /* Instruction Cache Enable */
>  
> +/* E500 L2CSR0 */
> +#define E500_L2CSR0_L2FI(1 << 21)   /* L2 cache flash invalidate */
> +#define E500_L2CSR0_L2FL(1 << 11)   /* L2 cache flush */
> +#define E500_L2CSR0_L2LFC   (1 << 10)   /* L2 cache lock flash clear */
> +
>  /* HID0 bits */
>  #define HID0_DEEPNAP(1 << 24)   /* pre-2.06 */
>  #define HID0_DOZE   (1 << 23)   /* pre-2.06 */
> diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
> index 9867d0a..3ec45cb 100644
> --- a/target/ppc/translate_init.c.inc
> +++ b/target/ppc/translate_init.c.inc
> @@ -1735,6 +1735,16 @@ static void spr_write_e500_l1csr1(DisasContext *ctx, 
> int sprn, int gprn)
>  tcg_temp_free(t0);
>  }
>  
> +static void spr_write_e500_l2csr0(DisasContext *ctx, int sprn, int gprn)
> +{
> +TCGv t0 = tcg_temp_new();
> +
> +tcg_gen_andi_tl(t0, cpu_gpr[gprn],
> +~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL | 
> E500_L2CSR0_L2LFC));
> +gen_store_spr(sprn, t0);
> +tcg_temp_free(t0);
> +}
> +
>  static void spr_write_booke206_mmucsr0(DisasContext *ctx, int sprn, int gprn)
>  {
>  gen_helper_booke206_tlbflush(cpu_env, cpu_gpr[gprn]);
> @@ -5029,6 +5039,12 @@ static void init_proc_e500(CPUPPCState *env, int 
> version)
>   SPR_NOACCESS, SPR_NOACCESS,
>   _read_generic, _write_e500_l1csr1,
>   0x);
> +if (version != fsl_e500v1 && version != fsl_e500v2) {
> +spr_register(env, SPR_Exxx_L2CSR0, "L2CSR0",
> + SPR_NOACCESS, SPR_NOACCESS,
> + _read_generic, _write_e500_l2csr0,
> + 0x);
> +}
>  spr_register(env, SPR_BOOKE_MCSRR0, "MCSRR0",
>   SPR_NOACCESS, SPR_NOACCESS,
>   _read_generic, _write_generic,

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH v2] hw/net: fsl_etsec: Reverse the RCTRL.RSF logic

2021-02-09 Thread Bin Meng
Hi David, Peter,

On Wed, Feb 10, 2021 at 9:16 AM David Gibson
 wrote:
>
> On Tue, Feb 09, 2021 at 09:48:18AM +, Peter Maydell wrote:
> > On Tue, 9 Feb 2021 at 01:22, Bin Meng  wrote:
> > >
> > > From: Bin Meng 
> > >
> > > Per MPC8548ERM [1] chapter 14.5.3.4.1:
> > >
> > > When RCTRL.RSF is 1, frames less than 64 bytes are accepted upon
> > > a DA match. But currently QEMU does the opposite.
> > >
> > > When RCTRL.RSF is 0, short frames are silently dropped, however
> > > we cannot drop such frames in QEMU as of today, because both slirp
> > > and tap networking do not pad short frames (e.g.: an ARP packet)
> > > to the minimum frame size of 60 bytes.
> > >
> > > If eTSEC is programmed to reject short frames, ARP requests will be
> > > dropped, preventing the guest from becoming visible on the network.
> > >
> > > The same issue was reported on e1000 and vmxnet3 before, see:
> > >
> > > commit 78aeb23eded2 ("e1000: Pad short frames to minimum size (60 bytes)")
> > > commit 40a87c6c9b11 ("vmxnet3: Pad short frames to minimum size (60 
> > > bytes)")
> > >
> > > Ideally this should be fixed on the slirp/tap networking side to
> > > pad short frames to the minimum frame length, but I am not sure
> > > whether that's doable.
> > >
> > > This commit reverses the RCTRL.RSF testing logic to match the spec.
> > > The log message is updated to mention the reject short frames
> > > functionality is unimplemented.
> > >
> > > [1] https://www.nxp.com/docs/en/reference-manual/MPC8548ERM.pdf
> > >
> > > Fixes: eb1e7c3e5146 ("Add Enhanced Three-Speed Ethernet Controller 
> > > (eTSEC)")
> > > Signed-off-by: Bin Meng 
> >
> >
> > > -if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
> > > +/*
> > > + * Both slirp and tap networking do not pad short frames
> > > + * (e.g.: an ARP packet) to the minimum frame size of 60 bytes.
> > > + *
> > > + * If eTSEC is programmed to reject short frames, ARP requests
> > > + * will be dropped, preventing the guest from becoming visible
> > > + * on the network.
> > > + */
> > > +if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
> > >  /* CRC is not in the packet yet, so short frame is below 60 
> > > bytes */
> > > -RING_DEBUG("%s: Drop short frame\n", __func__);
> > > -return -1;
> > > +RING_DEBUG("%s: Drop short frame not implemented\n", __func__);
> > >  }
> >
> > This change is doing two things at once.
>
> Oops, I missed that.
>
> > One of them is an entirely uncontroversial bug fix: we
> > got the sense of the RCTRL_RSF test the wrong way round.
> >
> > The other is different: it is working around a bug elsewhere in QEMU.
> >
> > If there's a problem with packets that should not be short
> > frames being presented to ethernet devices as short frames,
> > please fix that bug at the source. I don't think we should
> > take any more device-model workarounds for it. We have lots
> > and lots of ethernet device models: it will be much more
> > effort to try to fix them all one by one as people encounter
> > this bug than it would be to just fix the code that's creating
> > bogus short frames.
> >
> > David, could you drop this from your queue, please ?
>
> Done.

OK, I will only do the reverse then.

Regards,
Bin



Re: [PATCH v5 0/9] block: Add retry for werror=/rerror= mechanism

2021-02-09 Thread Jiahui Cen
Kindly ping.
Any comments and reviews are welcome :)

Thanks,
Jiahui

On 2021/2/5 18:13, Jiahui Cen wrote:
> A VM in the cloud environment may use a virtual disk as the backend storage,
> and there are usually filesystems on the virtual block device. When backend
> storage is temporarily down, any I/O issued to the virtual block device
> will cause an error. For example, an error occurred in ext4 filesystem would
> make the filesystem readonly. In production environment, a cloud backend
> storage can be soon recovered. For example, an IP-SAN may be down due to
> network failure and will be online soon after network is recovered. However,
> the error in the filesystem may not be recovered unless a device reattach
> or system restart. Thus an I/O retry mechanism is in need to implement a
> self-healing system.
> 
> This patch series propose to extend the werror=/rerror= mechanism to add
> a 'retry' feature. It can automatically retry failed I/O requests on error
> without sending error back to guest, and guest can get back running smoothly
> when I/O is recovered.
> 
> v4->v5:
> * Add document for 'retry' in qapi.
> * Support werror=/rerror=retry for scsi-disk.
> * Pause retry when draining.
> 
> v3->v4:
> * Adapt to werror=/rerror= mechanism.
> 
> v2->v3:
> * Add a doc to describe I/O hang.
> 
> v1->v2:
> * Rebase to fix compile problems.
> * Fix incorrect remove of rehandle list.
> * Provide rehandle pause interface.
> 
> REF: https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg06560.html
> 
> Jiahui Cen (9):
>   qapi/block-core: Add retry option for error action
>   block-backend: Introduce retry timer
>   block-backend: Add device specific retry callback
>   block-backend: Enable retry action on errors
>   block-backend: Add timeout support for retry
>   block: Add error retry param setting
>   virtio_blk: Add support for retry on errors
>   scsi-bus: Refactor the code that retries requests
>   scsi-disk: Add support for retry on errors
> 
>  block/block-backend.c  | 68 
>  blockdev.c | 52 +++
>  hw/block/block.c   | 10 +++
>  hw/block/virtio-blk.c  | 21 +-
>  hw/scsi/scsi-bus.c | 16 +++--
>  hw/scsi/scsi-disk.c| 16 +
>  include/hw/block/block.h   |  7 +-
>  include/hw/scsi/scsi.h |  1 +
>  include/sysemu/block-backend.h | 10 +++
>  qapi/block-core.json   |  9 ++-
>  10 files changed, 199 insertions(+), 11 deletions(-)
> 



Re: [PATCH 2/2] spapr_iommu: Fix vhost integration regression

2021-02-09 Thread David Gibson
On Tue, Feb 09, 2021 at 10:32:33PM +0100, Eric Auger wrote:
> Previous work on dev-iotlb message broke spapr_iommu/vhost integration
> as it did for SMMU and virtio-iommu. The spapr_iommu currently
> only sends IOMMU_NOTIFIER_UNMAP notifications. Since commit
> 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support"),
> VHOST first tries to register IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier
> and if it fails, falls back to legacy IOMMU_NOTIFIER_UNMAP. So
> spapr_iommu must fail on the IOMMU_NOTIFIER_DEVIOTLB_UNMAP
> registration.
> 
> Reported-by: Peter Xu 
> Fixes: b68ba1ca57677acf870d5ab10579e6105c1f5338
> Signed-off-by: Eric Auger 

Acked-by: David Gibson 

> ---
>  hw/ppc/spapr_iommu.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 30352df00e..24537ffcbd 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -212,6 +212,11 @@ static int 
> spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>  {
>  struct SpaprTceTable *tbl = container_of(iommu, SpaprTceTable, iommu);
>  
> +if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
> +error_setg(errp, "spart_tce does not support dev-iotlb yet");
> +return -EINVAL;
> +}
> +
>  if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) {
>  spapr_tce_set_need_vfio(tbl, true);
>  } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) {

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH v2] hw/net: fsl_etsec: Reverse the RCTRL.RSF logic

2021-02-09 Thread David Gibson
On Tue, Feb 09, 2021 at 09:48:18AM +, Peter Maydell wrote:
> On Tue, 9 Feb 2021 at 01:22, Bin Meng  wrote:
> >
> > From: Bin Meng 
> >
> > Per MPC8548ERM [1] chapter 14.5.3.4.1:
> >
> > When RCTRL.RSF is 1, frames less than 64 bytes are accepted upon
> > a DA match. But currently QEMU does the opposite.
> >
> > When RCTRL.RSF is 0, short frames are silently dropped, however
> > we cannot drop such frames in QEMU as of today, due to both slirp
> > and tap networking do not pad short frames (e.g.: an ARP packet)
> > to the minimum frame size of 60 bytes.
> >
> > If eTSEC is programmed to reject short frames, ARP requests will be
> > dropped, preventing the guest from becoming visible on the network.
> >
> > The same issue was reported on e1000 and vmxnet3 before, see:
> >
> > commit 78aeb23eded2 ("e1000: Pad short frames to minimum size (60 bytes)")
> > commit 40a87c6c9b11 ("vmxnet3: Pad short frames to minimum size (60 bytes)")
> >
> > Ideally this should be fixed on the slirp/tap networking side to
> > pad short frames to the minimum frame length, but I am not sure
> > whether that's doable.
> >
> > This commit reverses the RCTRL.RSF testing logic to match the spec.
> > The log message is updated to mention the reject short frames
> > functionality is unimplemented.
> >
> > [1] https://www.nxp.com/docs/en/reference-manual/MPC8548ERM.pdf
> >
> > Fixes: eb1e7c3e5146 ("Add Enhanced Three-Speed Ethernet Controller (eTSEC)")
> > Signed-off-by: Bin Meng 
> 
> 
> > -if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
> > +/*
> > + * Both slirp and tap networking do not pad short frames
> > + * (e.g.: an ARP packet) to the minimum frame size of 60 bytes.
> > + *
> > + * If eTSEC is programmed to reject short frames, ARP requests
> > + * will be dropped, preventing the guest from becoming visible
> > + * on the network.
> > + */
> > +if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
> >  /* CRC is not in the packet yet, so short frame is below 60 bytes 
> > */
> > -RING_DEBUG("%s: Drop short frame\n", __func__);
> > -return -1;
> > +RING_DEBUG("%s: Drop short frame not implemented\n", __func__);
> >  }
> 
> This change is doing two things at once.

Oops, I missed that.

> One of them is an entirely uncontroversial bug fix: we
> got the sense of the RCTRL_RSF test the wrong way round.
> 
> The other is different: it is working around a bug elsewhere in QEMU.
> 
> If there's a problem with packets that should not be short
> frames being presented to ethernet devices as short frames,
> please fix that bug at the source. I don't think we should
> take any more device-model workarounds for it. We have lots
> and lots of ethernet device models: it will be much more
> effort to try to fix them all one by one as people encounter
> this bug than it would be to just fix the code that's creating
> bogus short frames.
> 
> David, could you drop this from your queue, please ?

Done.

> 
> thanks
> -- PMM
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH 00/26] ppc: qemu: Convert qemu-ppce500 to driver model

2021-02-09 Thread Bin Meng
On Sun, Feb 7, 2021 at 11:11 PM Bin Meng  wrote:
>
> At present when building qemu-ppce500 the following warnings are seen:
>
> = WARNING ==
> This board does not use CONFIG_DM. CONFIG_DM will be
> compulsory starting with the v2020.01 release.
> Failure to update may result in board removal.
>   UPD include/generated/timestamp_autogenerated.h
> See doc/driver-model/migration.rst for more info.
> 
> = WARNING ==
> This board does not use CONFIG_DM_PCI Please update
> the board to use CONFIG_DM_PCI before the v2019.07 release.
> Failure to update by the deadline may result in board removal.
> See doc/driver-model/migration.rst for more info.
> 
> = WARNING ==
> This board does not use CONFIG_DM_ETH (Driver Model
> for Ethernet drivers). Please update the board to use
> CONFIG_DM_ETH before the v2020.07 release. Failure to
> update by the deadline may result in board removal.
> See doc/driver-model/migration.rst for more info.
> 
>
> The conversion of qemu-ppce500 board to driver model is long overdue.
>
> When testing the existing qemu-ppce500 support, PCI was found broken.
> This is caused by 2 separate issues:
>
> - One issue was caused by U-Boot:
>   Commit e002474158d1 ("pci: pci-uclass: Dynamically allocate the PCI 
> regions")
>   Patch #1 reverts this commit as it broken all boards that have not converted
>   to driver model PCI.
> - One issue was caused by QEMU:
>   commit e6b4e5f4795b ("PPC: e500: Move CCSR and MMIO space to upper end of 
> address space")
>   commit cb3778a0455a ("PPC: e500 pci host: Add support for ATMUs")
>   Patch #3-4 fixed this issue to keep in sync with latest QEMU upstream
>
> Patch #5-8 are minor fixes and clean-ups.
>
> Starting from patch#9, these are driver model conversion patches.
>
> Patch #11-16 are mainly related to CONFIG_ADDR_MAP, a library to support 
> targets
> that have non-identity virtual-physical address mappings. A new command 
> 'addrmap'
> is introduced to aid debugging, and a fix to arch/powerpc/asm/include/io.h is
> made to correct the usage of CONFIG_ADDR_MAP as it can only be used in the 
> post-
> relocation phase. Also the initialization of this library is moved a bit 
> earlier
> in the post-relocation phase otherwise device drivers won't work.
>
> Patch #18-20 are 85xx PCI driver fixes. It adds support to controller register
> physical address beyond 32-bit, as well as support to 64-bit bus and cpu 
> address
> as current upstream QEMU uses 64-bit cpu address.
>
> Patch #23 is minor fix to the 'virtio' command dependency.
>
> Patch #24 enables the VirtIO NET support as by default a VirtIO standard PCI
> networking device is connected as an ethernet interface at PCI address 0.1.0.
>
> Patch #25 moves the qemu-ppce500 boards codes to board/emulation as that is 
> the
> place for other QEMU targets like x86, arm, riscv.
>
> Patch #26 adds a reST document to describe how to build and run U-Boot for the
> QEMU ppce500 machine.
>
> I hope we can make this series to U-Boot v2021.04 release.
>
> This series is available at u-boot-x86/qemu-ppc for testing.
>
> This cover letter is cc'ed to QEMU mailing list for a heads-up.
> A future patch will be sent to QEMU mailing list to bring its in-tree
> U-Boot source codes up-to-date.
>
>
> Bin Meng (26):
>   Revert "pci: pci-uclass: Dynamically allocate the PCI regions"
>   ppc: qemu: Update MAINTAINERS for correct email address
>   common: fdt_support: Support special case of PCI address in
> fdt_read_prop()
>   ppc: qemu: Support non-identity PCI bus address
>   ppc: qemu: Fix CONFIG_SYS_PCI_MAP_END
>   ppc: mpc85xx: Wrap LAW related codes with CONFIG_FSL_LAW
>   ppc: qemu: Drop init_laws() and print_laws()
>   ppc: qemu: Drop board_early_init_f()
>   ppc: qemu: Enable OF_CONTROL
>   ppc: qemu: Enable driver model
>   include: Remove extern from addr_map.h
>   lib: addr_map: Move address_map[] type to the header file
>   cmd: Add a command to display the address map
>   lib: kconfig: Mention CONFIG_ADDR_MAP limitation in the help
>   ppc: io.h: Use addrmap_ translation APIs only in post-relocation phase
>   common: Move initr_addr_map() to a bit earlier
>   ppc: qemu: Switch over to use DM serial
>   pci: mpc85xx: Wrap LAW programming with CONFIG_FSL_LAW
>   pci: mpc85xx: Support controller register physical address beyond
> 32-bit
>   pci: mpc85xx: Support 64-bit bus and cpu address
>   ppc: qemu: Switch over to use DM ETH and PCI
>   ppc: qemu: Drop CONFIG_OF_BOARD_SETUP
>   cmd: Fix virtio command dependency
>   ppc: qemu: Enable VirtIO NET support
>   ppc: qemu: Move board directory from board/freescale to
> board/emulation
>   doc: Add a reST document for qemu-ppce500
>
>  arch/powerpc/cpu/mpc85xx/Kconfig 

[PATCH v6 29/31] target/arm: Add allocation tag storage for user mode

2021-02-09 Thread Richard Henderson
Use the now-saved PAGE_ANON and PAGE_MTE bits,
and the per-page saved data.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 target/arm/mte_helper.c | 29 +++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
index d55f8d1e1e..1c569336ea 100644
--- a/target/arm/mte_helper.c
+++ b/target/arm/mte_helper.c
@@ -78,8 +78,33 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int 
ptr_mmu_idx,
int tag_size, uintptr_t ra)
 {
 #ifdef CONFIG_USER_ONLY
-/* Tag storage not implemented.  */
-return NULL;
+uint64_t clean_ptr = useronly_clean_ptr(ptr);
+int flags = page_get_flags(clean_ptr);
+uint8_t *tags;
+uintptr_t index;
+
+if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE : PAGE_READ))) {
+/* SIGSEGV */
+arm_cpu_tlb_fill(env_cpu(env), ptr, ptr_size, ptr_access,
+ ptr_mmu_idx, false, ra);
+g_assert_not_reached();
+}
+
+/* Require both MAP_ANON and PROT_MTE for the page. */
+if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) {
+return NULL;
+}
+
+tags = page_get_target_data(clean_ptr);
+if (tags == NULL) {
+size_t alloc_size = TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1);
+tags = page_alloc_target_data(clean_ptr, alloc_size);
+assert(tags != NULL);
+}
+
+index = extract32(ptr, LOG2_TAG_GRANULE + 1,
+  TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1);
+return tags + index;
 #else
 uintptr_t index;
 CPUIOTLBEntry *iotlbentry;
-- 
2.25.1




[PATCH v6 28/31] linux-user/aarch64: Signal SEGV_MTEAERR for async tag check error

2021-02-09 Thread Richard Henderson
The real kernel collects _TIF_MTE_ASYNC_FAULT into the current thread's
state on any kernel entry (interrupt, exception etc), and then delivers
the signal in advance of resuming the thread.

This means that while the signal won't be delivered immediately, it will
not be delayed forever -- at minimum it will be delivered after the next
clock interrupt.

We don't have a clock interrupt in linux-user, so we issue a cpu_kick
to signal a return to the main loop at the end of the current TB.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/target_signal.h |  1 +
 linux-user/aarch64/cpu_loop.c  | 11 +++
 target/arm/mte_helper.c| 10 ++
 3 files changed, 22 insertions(+)

diff --git a/linux-user/aarch64/target_signal.h 
b/linux-user/aarch64/target_signal.h
index 777fb667fe..18013e1b23 100644
--- a/linux-user/aarch64/target_signal.h
+++ b/linux-user/aarch64/target_signal.h
@@ -21,6 +21,7 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_SEGV_MTEAERR  8  /* Asynchronous ARM MTE error */
 #define TARGET_SEGV_MTESERR  9  /* Synchronous ARM MTE exception */
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
diff --git a/linux-user/aarch64/cpu_loop.c b/linux-user/aarch64/cpu_loop.c
index b6a2e65593..7c42f65706 100644
--- a/linux-user/aarch64/cpu_loop.c
+++ b/linux-user/aarch64/cpu_loop.c
@@ -164,6 +164,17 @@ void cpu_loop(CPUARMState *env)
 EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", 
trapnr);
 abort();
 }
+
+/* Check for MTE asynchronous faults */
+if (unlikely(env->cp15.tfsr_el[0])) {
+env->cp15.tfsr_el[0] = 0;
+info.si_signo = TARGET_SIGSEGV;
+info.si_errno = 0;
+info._sifields._sigfault._addr = 0;
+info.si_code = TARGET_SEGV_MTEAERR;
+queue_signal(env, info.si_signo, QEMU_SI_FAULT, );
+}
+
 process_pending_signals(env);
 /* Exception return on AArch64 always clears the exclusive monitor,
  * so any return to running guest code implies this.
diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
index 153bd1e9df..d55f8d1e1e 100644
--- a/target/arm/mte_helper.c
+++ b/target/arm/mte_helper.c
@@ -565,6 +565,16 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc,
 select = 0;
 }
 env->cp15.tfsr_el[el] |= 1 << select;
+#ifdef CONFIG_USER_ONLY
+/*
+ * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT,
+ * which then sends a SIGSEGV when the thread is next scheduled.
+ * This cpu will return to the main loop at the end of the TB,
+ * which is rather sooner than "normal".  But the alternative
+ * is waiting until the next syscall.
+ */
+qemu_cpu_kick(env_cpu(env));
+#endif
 break;
 
 default:
-- 
2.25.1




[PATCH v6 31/31] tests/tcg/aarch64: Add mte smoke tests

2021-02-09 Thread Richard Henderson
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 tests/tcg/aarch64/mte.h   | 60 +++
 tests/tcg/aarch64/mte-1.c | 28 +++
 tests/tcg/aarch64/mte-2.c | 45 +++
 tests/tcg/aarch64/mte-3.c | 51 ++
 tests/tcg/aarch64/mte-4.c | 45 +++
 tests/tcg/aarch64/Makefile.target |  6 
 tests/tcg/configure.sh|  4 +++
 7 files changed, 239 insertions(+)
 create mode 100644 tests/tcg/aarch64/mte.h
 create mode 100644 tests/tcg/aarch64/mte-1.c
 create mode 100644 tests/tcg/aarch64/mte-2.c
 create mode 100644 tests/tcg/aarch64/mte-3.c
 create mode 100644 tests/tcg/aarch64/mte-4.c

diff --git a/tests/tcg/aarch64/mte.h b/tests/tcg/aarch64/mte.h
new file mode 100644
index 00..141cef522c
--- /dev/null
+++ b/tests/tcg/aarch64/mte.h
@@ -0,0 +1,60 @@
+/*
+ * Linux kernel fallback API definitions for MTE and test helpers.
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef PR_SET_TAGGED_ADDR_CTRL
+# define PR_SET_TAGGED_ADDR_CTRL  55
+#endif
+#ifndef PR_TAGGED_ADDR_ENABLE
+# define PR_TAGGED_ADDR_ENABLE(1UL << 0)
+#endif
+#ifndef PR_MTE_TCF_SHIFT
+# define PR_MTE_TCF_SHIFT 1
+# define PR_MTE_TCF_NONE  (0UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TCF_SYNC  (1UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TAG_SHIFT 3
+#endif
+
+#ifndef PROT_MTE
+# define PROT_MTE 0x20
+#endif
+
+#ifndef SEGV_MTEAERR
+# define SEGV_MTEAERR8
+# define SEGV_MTESERR9
+#endif
+
+static void enable_mte(int tcf)
+{
+int r = prctl(PR_SET_TAGGED_ADDR_CTRL,
+  PR_TAGGED_ADDR_ENABLE | tcf | (0xfffe << PR_MTE_TAG_SHIFT),
+  0, 0, 0);
+if (r < 0) {
+perror("PR_SET_TAGGED_ADDR_CTRL");
+exit(2);
+}
+}
+
+static void *alloc_mte_mem(size_t size)
+{
+void *p = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
+   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+if (p == MAP_FAILED) {
+perror("mmap PROT_MTE");
+exit(2);
+}
+return p;
+}
diff --git a/tests/tcg/aarch64/mte-1.c b/tests/tcg/aarch64/mte-1.c
new file mode 100644
index 00..88dcd617ad
--- /dev/null
+++ b/tests/tcg/aarch64/mte-1.c
@@ -0,0 +1,28 @@
+/*
+ * Memory tagging, basic pass cases.
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "mte.h"
+
+int main(int ac, char **av)
+{
+int *p0, *p1, *p2;
+long c;
+
+enable_mte(PR_MTE_TCF_NONE);
+p0 = alloc_mte_mem(sizeof(*p0));
+
+asm("irg %0,%1,%2" : "=r"(p1) : "r"(p0), "r"(1));
+assert(p1 != p0);
+asm("subp %0,%1,%2" : "=r"(c) : "r"(p0), "r"(p1));
+assert(c == 0);
+
+asm("stg %0, [%0]" : : "r"(p1));
+asm("ldg %0, [%1]" : "=r"(p2) : "r"(p0), "0"(p0));
+assert(p1 == p2);
+
+return 0;
+}
diff --git a/tests/tcg/aarch64/mte-2.c b/tests/tcg/aarch64/mte-2.c
new file mode 100644
index 00..a62278276a
--- /dev/null
+++ b/tests/tcg/aarch64/mte-2.c
@@ -0,0 +1,45 @@
+/*
+ * Memory tagging, basic fail cases, synchronous signals.
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "mte.h"
+
+void pass(int sig, siginfo_t *info, void *uc)
+{
+assert(info->si_code == SEGV_MTESERR);
+exit(0);
+}
+
+int main(int ac, char **av)
+{
+struct sigaction sa;
+int *p0, *p1, *p2;
+long excl = 1;
+
+enable_mte(PR_MTE_TCF_SYNC);
+p0 = alloc_mte_mem(sizeof(*p0));
+
+/* Create two differently tagged pointers.  */
+asm("irg %0,%1,%2" : "=r"(p1) : "r"(p0), "r"(excl));
+asm("gmi %0,%1,%0" : "+r"(excl) : "r" (p1));
+assert(excl != 1);
+asm("irg %0,%1,%2" : "=r"(p2) : "r"(p0), "r"(excl));
+assert(p1 != p2);
+
+/* Store the tag from the first pointer.  */
+asm("stg %0, [%0]" : : "r"(p1));
+
+*p1 = 0;
+
+memset(, 0, sizeof(sa));
+sa.sa_sigaction = pass;
+sa.sa_flags = SA_SIGINFO;
+sigaction(SIGSEGV, , NULL);
+
+*p2 = 0;
+
+abort();
+}
diff --git a/tests/tcg/aarch64/mte-3.c b/tests/tcg/aarch64/mte-3.c
new file mode 100644
index 00..424ea685c2
--- /dev/null
+++ b/tests/tcg/aarch64/mte-3.c
@@ -0,0 +1,51 @@
+/*
+ * Memory tagging, basic fail cases, asynchronous signals.
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "mte.h"
+
+void pass(int sig, siginfo_t *info, void *uc)
+{
+assert(info->si_code == SEGV_MTEAERR);
+exit(0);
+}
+
+int main(int ac, char **av)
+{
+struct sigaction sa;
+long *p0, *p1, *p2;
+long excl = 1;
+
+enable_mte(PR_MTE_TCF_ASYNC);
+p0 = alloc_mte_mem(sizeof(*p0));
+
+/* Create two differently tagged pointers.  */
+asm("irg %0,%1,%2" : 

[PATCH v6 30/31] target/arm: Enable MTE for user-only

2021-02-09 Thread Richard Henderson
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 target/arm/cpu.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 761f0c61bd..929de1071b 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -209,6 +209,21 @@ static void arm_cpu_reset(DeviceState *dev)
  * Note that this must match useronly_clean_ptr.
  */
 env->cp15.tcr_el[1].raw_tcr = (1ULL << 37);
+
+/* Enable MTE */
+if (cpu_isar_feature(aa64_mte, cpu)) {
+/* Enable tag access, but leave TCF0 as No Effect (0). */
+env->cp15.sctlr_el[1] |= SCTLR_ATA0;
+/*
+ * Exclude all tags, so that tag 0 is always used.
+ * This corresponds to Linux current->thread.gcr_incl = 0.
+ *
+ * Set RRND, so that helper_irg() will generate a seed later.
+ * Here in cpu_reset(), the crypto subsystem has not yet been
+ * initialized.
+ */
+env->cp15.gcr_el1 = 0x1;
+}
 #else
 /* Reset into the highest available EL */
 if (arm_feature(env, ARM_FEATURE_EL3)) {
-- 
2.25.1




[PATCH v6 20/31] linux-user/aarch64: Implement PR_TAGGED_ADDR_ENABLE

2021-02-09 Thread Richard Henderson
This is the prctl bit that controls whether syscalls accept tagged
addresses.  See Documentation/arm64/tagged-address-abi.rst in the
linux kernel.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/target_syscall.h |  4 
 target/arm/cpu-param.h  |  3 +++
 target/arm/cpu.h| 31 +
 linux-user/syscall.c| 24 ++
 4 files changed, 62 insertions(+)

diff --git a/linux-user/aarch64/target_syscall.h 
b/linux-user/aarch64/target_syscall.h
index 3194e6b009..820601dfcc 100644
--- a/linux-user/aarch64/target_syscall.h
+++ b/linux-user/aarch64/target_syscall.h
@@ -30,4 +30,8 @@ struct target_pt_regs {
 # define TARGET_PR_PAC_APDBKEY   (1 << 3)
 # define TARGET_PR_PAC_APGAKEY   (1 << 4)
 
+#define TARGET_PR_SET_TAGGED_ADDR_CTRL 55
+#define TARGET_PR_GET_TAGGED_ADDR_CTRL 56
+# define TARGET_PR_TAGGED_ADDR_ENABLE  (1UL << 0)
+
 #endif /* AARCH64_TARGET_SYSCALL_H */
diff --git a/target/arm/cpu-param.h b/target/arm/cpu-param.h
index 00e7d9e937..7f38d33b8e 100644
--- a/target/arm/cpu-param.h
+++ b/target/arm/cpu-param.h
@@ -20,6 +20,9 @@
 
 #ifdef CONFIG_USER_ONLY
 #define TARGET_PAGE_BITS 12
+# ifdef TARGET_AARCH64
+#  define TARGET_TAGGED_ADDRESSES
+# endif
 #else
 /*
  * ARMv7 and later CPUs have 4K pages minimum, but ARMv5 and v6
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index d080239863..558ad1466b 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -721,6 +721,11 @@ typedef struct CPUARMState {
 const struct arm_boot_info *boot_info;
 /* Store GICv3CPUState to access from this struct */
 void *gicv3state;
+
+#ifdef TARGET_TAGGED_ADDRESSES
+/* Linux syscall tagged address support */
+bool tagged_addr_enable;
+#endif
 } CPUARMState;
 
 static inline void set_feature(CPUARMState *env, int feature)
@@ -3602,6 +3607,32 @@ static inline MemTxAttrs 
*typecheck_memtxattrs(MemTxAttrs *x)
  */
 #define PAGE_BTI  PAGE_TARGET_1
 
+#ifdef TARGET_TAGGED_ADDRESSES
+/**
+ * cpu_untagged_addr:
+ * @cs: CPU context
+ * @x: tagged address
+ *
+ * Remove any address tag from @x.  This is explicitly related to the
+ * linux syscall TIF_TAGGED_ADDR setting, not TBI in general.
+ *
+ * There should be a better place to put this, but we need this in
+ * include/exec/cpu_ldst.h, and not some place linux-user specific.
+ */
+static inline target_ulong cpu_untagged_addr(CPUState *cs, target_ulong x)
+{
+ARMCPU *cpu = ARM_CPU(cs);
+if (cpu->env.tagged_addr_enable) {
+/*
+ * TBI is enabled for userspace but not kernelspace addresses.
+ * Only clear the tag if bit 55 is clear.
+ */
+x &= sextract64(x, 0, 56);
+}
+return x;
+}
+#endif
+
 /*
  * Naming convention for isar_feature functions:
  * Functions which test 32-bit ID registers should have _aa32_ in
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 24fc1daf02..ba4da7f8a6 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -10981,6 +10981,30 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 }
 }
 return -TARGET_EINVAL;
+case TARGET_PR_SET_TAGGED_ADDR_CTRL:
+{
+abi_ulong valid_mask = TARGET_PR_TAGGED_ADDR_ENABLE;
+CPUARMState *env = cpu_env;
+
+if ((arg2 & ~valid_mask) || arg3 || arg4 || arg5) {
+return -TARGET_EINVAL;
+}
+env->tagged_addr_enable = arg2 & TARGET_PR_TAGGED_ADDR_ENABLE;
+return 0;
+}
+case TARGET_PR_GET_TAGGED_ADDR_CTRL:
+{
+abi_long ret = 0;
+CPUARMState *env = cpu_env;
+
+if (arg2 || arg3 || arg4 || arg5) {
+return -TARGET_EINVAL;
+}
+if (env->tagged_addr_enable) {
+ret |= TARGET_PR_TAGGED_ADDR_ENABLE;
+}
+return ret;
+}
 #endif /* AARCH64 */
 case PR_GET_SECCOMP:
 case PR_SET_SECCOMP:
-- 
2.25.1




[PATCH v6 27/31] linux-user/aarch64: Signal SEGV_MTESERR for sync tag check fault

2021-02-09 Thread Richard Henderson
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/target_signal.h | 2 ++
 linux-user/aarch64/cpu_loop.c  | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/linux-user/aarch64/target_signal.h 
b/linux-user/aarch64/target_signal.h
index ddd73169f0..777fb667fe 100644
--- a/linux-user/aarch64/target_signal.h
+++ b/linux-user/aarch64/target_signal.h
@@ -21,5 +21,7 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_SEGV_MTESERR  9  /* Synchronous ARM MTE exception */
+
 #define TARGET_ARCH_HAS_SETUP_FRAME
 #endif /* AARCH64_TARGET_SIGNAL_H */
diff --git a/linux-user/aarch64/cpu_loop.c b/linux-user/aarch64/cpu_loop.c
index 4e43906e66..b6a2e65593 100644
--- a/linux-user/aarch64/cpu_loop.c
+++ b/linux-user/aarch64/cpu_loop.c
@@ -134,6 +134,9 @@ void cpu_loop(CPUARMState *env)
 case 0x0d ... 0x0f: /* Permission fault, level {1-3} */
 info.si_code = TARGET_SEGV_ACCERR;
 break;
+case 0x11: /* Synchronous Tag Check Fault */
+info.si_code = TARGET_SEGV_MTESERR;
+break;
 default:
 g_assert_not_reached();
 }
-- 
2.25.1




[PATCH v6 21/31] target/arm: Improve gen_top_byte_ignore

2021-02-09 Thread Richard Henderson
Use simple arithmetic instead of a conditional
move when tbi0 != tbi1.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 target/arm/translate-a64.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index ffc060e5d7..3ec0dc17d8 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -183,17 +183,20 @@ static void gen_top_byte_ignore(DisasContext *s, TCGv_i64 
dst,
 /* Sign-extend from bit 55.  */
 tcg_gen_sextract_i64(dst, src, 0, 56);
 
-if (tbi != 3) {
-TCGv_i64 tcg_zero = tcg_const_i64(0);
-
-/*
- * The two TBI bits differ.
- * If tbi0, then !tbi1: only use the extension if positive.
- * if !tbi0, then tbi1: only use the extension if negative.
- */
-tcg_gen_movcond_i64(tbi == 1 ? TCG_COND_GE : TCG_COND_LT,
-dst, dst, tcg_zero, dst, src);
-tcg_temp_free_i64(tcg_zero);
+switch (tbi) {
+case 1:
+/* tbi0 but !tbi1: only use the extension if positive */
+tcg_gen_and_i64(dst, dst, src);
+break;
+case 2:
+/* !tbi0 but tbi1: only use the extension if negative */
+tcg_gen_or_i64(dst, dst, src);
+break;
+case 3:
+/* tbi0 and tbi1: always use the extension */
+break;
+default:
+g_assert_not_reached();
 }
 }
 }
-- 
2.25.1




[PATCH v6 26/31] linux-user/aarch64: Pass syndrome to EXC_*_ABORT

2021-02-09 Thread Richard Henderson
A proper syndrome is required to fill in the proper si_code.
Use page_get_flags to determine permission vs translation for user-only.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/cpu_loop.c | 24 +---
 target/arm/tlb_helper.c   | 15 +--
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/linux-user/aarch64/cpu_loop.c b/linux-user/aarch64/cpu_loop.c
index 42b9c15f53..4e43906e66 100644
--- a/linux-user/aarch64/cpu_loop.c
+++ b/linux-user/aarch64/cpu_loop.c
@@ -23,6 +23,7 @@
 #include "cpu_loop-common.h"
 #include "qemu/guest-random.h"
 #include "hw/semihosting/common-semi.h"
+#include "target/arm/syndrome.h"
 
 #define get_user_code_u32(x, gaddr, env)\
 ({ abi_long __r = get_user_u32((x), (gaddr));   \
@@ -76,7 +77,7 @@
 void cpu_loop(CPUARMState *env)
 {
 CPUState *cs = env_cpu(env);
-int trapnr;
+int trapnr, ec, fsc;
 abi_long ret;
 target_siginfo_t info;
 
@@ -117,9 +118,26 @@ void cpu_loop(CPUARMState *env)
 case EXCP_DATA_ABORT:
 info.si_signo = TARGET_SIGSEGV;
 info.si_errno = 0;
-/* XXX: check env->error_code */
-info.si_code = TARGET_SEGV_MAPERR;
 info._sifields._sigfault._addr = env->exception.vaddress;
+
+/* We should only arrive here with EC in {DATAABORT, INSNABORT}. */
+ec = syn_get_ec(env->exception.syndrome);
+assert(ec == EC_DATAABORT || ec == EC_INSNABORT);
+
+/* Both EC have the same format for FSC, or close enough. */
+fsc = extract32(env->exception.syndrome, 0, 6);
+switch (fsc) {
+case 0x04 ... 0x07: /* Translation fault, level {0-3} */
+info.si_code = TARGET_SEGV_MAPERR;
+break;
+case 0x09 ... 0x0b: /* Access flag fault, level {1-3} */
+case 0x0d ... 0x0f: /* Permission fault, level {1-3} */
+info.si_code = TARGET_SEGV_ACCERR;
+break;
+default:
+g_assert_not_reached();
+}
+
 queue_signal(env, info.si_signo, QEMU_SI_FAULT, );
 break;
 case EXCP_DEBUG:
diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c
index df85079d9f..9609333cbd 100644
--- a/target/arm/tlb_helper.c
+++ b/target/arm/tlb_helper.c
@@ -154,21 +154,24 @@ bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int 
size,
   bool probe, uintptr_t retaddr)
 {
 ARMCPU *cpu = ARM_CPU(cs);
+ARMMMUFaultInfo fi = {};
 
 #ifdef CONFIG_USER_ONLY
-cpu->env.exception.vaddress = address;
-if (access_type == MMU_INST_FETCH) {
-cs->exception_index = EXCP_PREFETCH_ABORT;
+int flags = page_get_flags(useronly_clean_ptr(address));
+if (flags & PAGE_VALID) {
+fi.type = ARMFault_Permission;
 } else {
-cs->exception_index = EXCP_DATA_ABORT;
+fi.type = ARMFault_Translation;
 }
-cpu_loop_exit_restore(cs, retaddr);
+
+/* now we have a real cpu fault */
+cpu_restore_state(cs, retaddr, true);
+arm_deliver_fault(cpu, address, access_type, mmu_idx, );
 #else
 hwaddr phys_addr;
 target_ulong page_size;
 int prot, ret;
 MemTxAttrs attrs = {};
-ARMMMUFaultInfo fi = {};
 ARMCacheAttrs cacheattrs = {};
 
 /*
-- 
2.25.1




[PATCH v6 23/31] linux-user/aarch64: Implement PR_MTE_TCF and PR_MTE_TAG

2021-02-09 Thread Richard Henderson
These prctl fields are required for the function of MTE.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/target_syscall.h |  9 ++
 linux-user/syscall.c| 43 +
 2 files changed, 52 insertions(+)

diff --git a/linux-user/aarch64/target_syscall.h 
b/linux-user/aarch64/target_syscall.h
index 820601dfcc..76f6c3391d 100644
--- a/linux-user/aarch64/target_syscall.h
+++ b/linux-user/aarch64/target_syscall.h
@@ -33,5 +33,14 @@ struct target_pt_regs {
 #define TARGET_PR_SET_TAGGED_ADDR_CTRL 55
 #define TARGET_PR_GET_TAGGED_ADDR_CTRL 56
 # define TARGET_PR_TAGGED_ADDR_ENABLE  (1UL << 0)
+/* MTE tag check fault modes */
+# define TARGET_PR_MTE_TCF_SHIFT   1
+# define TARGET_PR_MTE_TCF_NONE(0UL << TARGET_PR_MTE_TCF_SHIFT)
+# define TARGET_PR_MTE_TCF_SYNC(1UL << TARGET_PR_MTE_TCF_SHIFT)
+# define TARGET_PR_MTE_TCF_ASYNC   (2UL << TARGET_PR_MTE_TCF_SHIFT)
+# define TARGET_PR_MTE_TCF_MASK(3UL << TARGET_PR_MTE_TCF_SHIFT)
+/* MTE tag inclusion mask */
+# define TARGET_PR_MTE_TAG_SHIFT   3
+# define TARGET_PR_MTE_TAG_MASK(0xUL << TARGET_PR_MTE_TAG_SHIFT)
 
 #endif /* AARCH64_TARGET_SYSCALL_H */
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index ba4da7f8a6..61bf6148e7 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -10985,17 +10985,53 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 {
 abi_ulong valid_mask = TARGET_PR_TAGGED_ADDR_ENABLE;
 CPUARMState *env = cpu_env;
+ARMCPU *cpu = env_archcpu(env);
+
+if (cpu_isar_feature(aa64_mte, cpu)) {
+valid_mask |= TARGET_PR_MTE_TCF_MASK;
+valid_mask |= TARGET_PR_MTE_TAG_MASK;
+}
 
 if ((arg2 & ~valid_mask) || arg3 || arg4 || arg5) {
 return -TARGET_EINVAL;
 }
 env->tagged_addr_enable = arg2 & TARGET_PR_TAGGED_ADDR_ENABLE;
+
+if (cpu_isar_feature(aa64_mte, cpu)) {
+switch (arg2 & TARGET_PR_MTE_TCF_MASK) {
+case TARGET_PR_MTE_TCF_NONE:
+case TARGET_PR_MTE_TCF_SYNC:
+case TARGET_PR_MTE_TCF_ASYNC:
+break;
+default:
+return -EINVAL;
+}
+
+/*
+ * Write PR_MTE_TCF to SCTLR_EL1[TCF0].
+ * Note that the syscall values are consistent with hw.
+ */
+env->cp15.sctlr_el[1] =
+deposit64(env->cp15.sctlr_el[1], 38, 2,
+  arg2 >> TARGET_PR_MTE_TCF_SHIFT);
+
+/*
+ * Write PR_MTE_TAG to GCR_EL1[Exclude].
+ * Note that the syscall uses an include mask,
+ * and hardware uses an exclude mask -- invert.
+ */
+env->cp15.gcr_el1 =
+deposit64(env->cp15.gcr_el1, 0, 16,
+  ~arg2 >> TARGET_PR_MTE_TAG_SHIFT);
+arm_rebuild_hflags(env);
+}
 return 0;
 }
 case TARGET_PR_GET_TAGGED_ADDR_CTRL:
 {
 abi_long ret = 0;
 CPUARMState *env = cpu_env;
+ARMCPU *cpu = env_archcpu(env);
 
 if (arg2 || arg3 || arg4 || arg5) {
 return -TARGET_EINVAL;
@@ -11003,6 +11039,13 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 if (env->tagged_addr_enable) {
 ret |= TARGET_PR_TAGGED_ADDR_ENABLE;
 }
+if (cpu_isar_feature(aa64_mte, cpu)) {
+/* See above. */
+ret |= (extract64(env->cp15.sctlr_el[1], 38, 2)
+<< TARGET_PR_MTE_TCF_SHIFT);
+ret = deposit64(ret, TARGET_PR_MTE_TAG_SHIFT, 16,
+~env->cp15.gcr_el1);
+}
 return ret;
 }
 #endif /* AARCH64 */
-- 
2.25.1




[PATCH v6 25/31] target/arm: Split out syndrome.h from internals.h

2021-02-09 Thread Richard Henderson
Move everything related to syndromes to a new file,
which can be shared with linux-user.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 target/arm/internals.h | 245 +---
 target/arm/syndrome.h  | 273 +
 2 files changed, 274 insertions(+), 244 deletions(-)
 create mode 100644 target/arm/syndrome.h

diff --git a/target/arm/internals.h b/target/arm/internals.h
index 1f7f81f10e..d1156cd0c2 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -26,6 +26,7 @@
 #define TARGET_ARM_INTERNALS_H
 
 #include "hw/registerfields.h"
+#include "syndrome.h"
 
 /* register banks for CPU modes */
 #define BANK_USRSYS 0
@@ -262,250 +263,6 @@ static inline bool extended_addresses_enabled(CPUARMState 
*env)
(arm_feature(env, ARM_FEATURE_LPAE) && (tcr->raw_tcr & TTBCR_EAE));
 }
 
-/* Valid Syndrome Register EC field values */
-enum arm_exception_class {
-EC_UNCATEGORIZED  = 0x00,
-EC_WFX_TRAP   = 0x01,
-EC_CP15RTTRAP = 0x03,
-EC_CP15RRTTRAP= 0x04,
-EC_CP14RTTRAP = 0x05,
-EC_CP14DTTRAP = 0x06,
-EC_ADVSIMDFPACCESSTRAP= 0x07,
-EC_FPIDTRAP   = 0x08,
-EC_PACTRAP= 0x09,
-EC_CP14RRTTRAP= 0x0c,
-EC_BTITRAP= 0x0d,
-EC_ILLEGALSTATE   = 0x0e,
-EC_AA32_SVC   = 0x11,
-EC_AA32_HVC   = 0x12,
-EC_AA32_SMC   = 0x13,
-EC_AA64_SVC   = 0x15,
-EC_AA64_HVC   = 0x16,
-EC_AA64_SMC   = 0x17,
-EC_SYSTEMREGISTERTRAP = 0x18,
-EC_SVEACCESSTRAP  = 0x19,
-EC_INSNABORT  = 0x20,
-EC_INSNABORT_SAME_EL  = 0x21,
-EC_PCALIGNMENT= 0x22,
-EC_DATAABORT  = 0x24,
-EC_DATAABORT_SAME_EL  = 0x25,
-EC_SPALIGNMENT= 0x26,
-EC_AA32_FPTRAP= 0x28,
-EC_AA64_FPTRAP= 0x2c,
-EC_SERROR = 0x2f,
-EC_BREAKPOINT = 0x30,
-EC_BREAKPOINT_SAME_EL = 0x31,
-EC_SOFTWARESTEP   = 0x32,
-EC_SOFTWARESTEP_SAME_EL   = 0x33,
-EC_WATCHPOINT = 0x34,
-EC_WATCHPOINT_SAME_EL = 0x35,
-EC_AA32_BKPT  = 0x38,
-EC_VECTORCATCH= 0x3a,
-EC_AA64_BKPT  = 0x3c,
-};
-
-#define ARM_EL_EC_SHIFT 26
-#define ARM_EL_IL_SHIFT 25
-#define ARM_EL_ISV_SHIFT 24
-#define ARM_EL_IL (1 << ARM_EL_IL_SHIFT)
-#define ARM_EL_ISV (1 << ARM_EL_ISV_SHIFT)
-
-static inline uint32_t syn_get_ec(uint32_t syn)
-{
-return syn >> ARM_EL_EC_SHIFT;
-}
-
-/* Utility functions for constructing various kinds of syndrome value.
- * Note that in general we follow the AArch64 syndrome values; in a
- * few cases the value in HSR for exceptions taken to AArch32 Hyp
- * mode differs slightly, and we fix this up when populating HSR in
- * arm_cpu_do_interrupt_aarch32_hyp().
- * The exception is FP/SIMD access traps -- these report extra information
- * when taking an exception to AArch32. For those we include the extra coproc
- * and TA fields, and mask them out when taking the exception to AArch64.
- */
-static inline uint32_t syn_uncategorized(void)
-{
-return (EC_UNCATEGORIZED << ARM_EL_EC_SHIFT) | ARM_EL_IL;
-}
-
-static inline uint32_t syn_aa64_svc(uint32_t imm16)
-{
-return (EC_AA64_SVC << ARM_EL_EC_SHIFT) | ARM_EL_IL | (imm16 & 0x);
-}
-
-static inline uint32_t syn_aa64_hvc(uint32_t imm16)
-{
-return (EC_AA64_HVC << ARM_EL_EC_SHIFT) | ARM_EL_IL | (imm16 & 0x);
-}
-
-static inline uint32_t syn_aa64_smc(uint32_t imm16)
-{
-return (EC_AA64_SMC << ARM_EL_EC_SHIFT) | ARM_EL_IL | (imm16 & 0x);
-}
-
-static inline uint32_t syn_aa32_svc(uint32_t imm16, bool is_16bit)
-{
-return (EC_AA32_SVC << ARM_EL_EC_SHIFT) | (imm16 & 0x)
-| (is_16bit ? 0 : ARM_EL_IL);
-}
-
-static inline uint32_t syn_aa32_hvc(uint32_t imm16)
-{
-return (EC_AA32_HVC << ARM_EL_EC_SHIFT) | ARM_EL_IL | (imm16 & 0x);
-}
-
-static inline uint32_t syn_aa32_smc(void)
-{
-return (EC_AA32_SMC << ARM_EL_EC_SHIFT) | ARM_EL_IL;
-}
-
-static inline uint32_t syn_aa64_bkpt(uint32_t imm16)
-{
-return (EC_AA64_BKPT << ARM_EL_EC_SHIFT) | ARM_EL_IL | (imm16 & 0x);
-}
-
-static inline uint32_t syn_aa32_bkpt(uint32_t imm16, bool is_16bit)
-{
-return (EC_AA32_BKPT << ARM_EL_EC_SHIFT) | (imm16 & 0x)
-| (is_16bit ? 0 : ARM_EL_IL);
-}
-
-static inline uint32_t syn_aa64_sysregtrap(int op0, int op1, int op2,
-   int crn, int crm, int rt,
-   int isread)
-{
-return (EC_SYSTEMREGISTERTRAP << ARM_EL_EC_SHIFT) | ARM_EL_IL
-| (op0 << 20) | (op2 << 17) | (op1 << 14) | (crn << 10) | (rt << 5)
-| (crm << 1) | isread;
-}
-
-static inline uint32_t syn_cp14_rt_trap(int cv, int cond, int opc1, 

[PATCH v6 24/31] linux-user/aarch64: Implement PROT_MTE

2021-02-09 Thread Richard Henderson
Remember the PROT_MTE bit as PAGE_MTE/PAGE_TARGET_2.
Otherwise this does not yet have effect.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu-all.h|  1 +
 linux-user/syscall_defs.h |  1 +
 target/arm/cpu.h  |  1 +
 linux-user/mmap.c | 22 ++
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index d6ad774c01..09b9be845d 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -284,6 +284,7 @@ extern intptr_t qemu_host_page_mask;
 #endif
 /* Target-specific bits that will be used via page_get_flags().  */
 #define PAGE_TARGET_1  0x0080
+#define PAGE_TARGET_2  0x0200
 
 #if defined(CONFIG_USER_ONLY)
 void page_dump(FILE *f);
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index f98c1c1c8d..46a960fccb 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -1311,6 +1311,7 @@ struct target_winsize {
 
 #ifdef TARGET_AARCH64
 #define TARGET_PROT_BTI 0x10
+#define TARGET_PROT_MTE 0x20
 #endif
 
 /* Common */
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 558ad1466b..e3e61ce7ab 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3606,6 +3606,7 @@ static inline MemTxAttrs *typecheck_memtxattrs(MemTxAttrs 
*x)
  * AArch64 usage of the PAGE_TARGET_* bits for linux-user.
  */
 #define PAGE_BTI  PAGE_TARGET_1
+#define PAGE_MTE  PAGE_TARGET_2
 
 #ifdef TARGET_TAGGED_ADDRESSES
 /**
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 6690384752..85e218ab1d 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -84,18 +84,24 @@ static int validate_prot_to_pageflags(int *host_prot, int 
prot)
| (prot & PROT_EXEC ? PROT_READ : 0);
 
 #ifdef TARGET_AARCH64
-/*
- * The PROT_BTI bit is only accepted if the cpu supports the feature.
- * Since this is the unusual case, don't bother checking unless
- * the bit has been requested.  If set and valid, record the bit
- * within QEMU's page_flags.
- */
-if (prot & TARGET_PROT_BTI) {
+{
 ARMCPU *cpu = ARM_CPU(thread_cpu);
-if (cpu_isar_feature(aa64_bti, cpu)) {
+
+/*
+ * The PROT_BTI bit is only accepted if the cpu supports the feature.
+ * Since this is the unusual case, don't bother checking unless
+ * the bit has been requested.  If set and valid, record the bit
+ * within QEMU's page_flags.
+ */
+if ((prot & TARGET_PROT_BTI) && cpu_isar_feature(aa64_bti, cpu)) {
 valid |= TARGET_PROT_BTI;
 page_flags |= PAGE_BTI;
 }
+/* Similarly for the PROT_MTE bit. */
+if ((prot & TARGET_PROT_MTE) && cpu_isar_feature(aa64_mte, cpu)) {
+valid |= TARGET_PROT_MTE;
+page_flags |= PAGE_MTE;
+}
 }
 #endif
 
-- 
2.25.1




[PATCH v6 22/31] target/arm: Use the proper TBI settings for linux-user

2021-02-09 Thread Richard Henderson
We were fudging TBI1 enabled to speed up the generated code.
Now that we've improved the code generation, remove this.
Also, tidy the comment to reflect the current code.

The pauth test was testing a kernel address (-1) and making
incorrect assumptions about TBI1; stick to userland addresses.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 target/arm/internals.h  |  4 ++--
 target/arm/cpu.c| 10 +++---
 tests/tcg/aarch64/pauth-2.c |  1 -
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index 448982dd2f..1f7f81f10e 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1419,9 +1419,9 @@ static inline bool tcma_check(uint32_t desc, int bit55, 
int ptr_tag)
  */
 static inline uint64_t useronly_clean_ptr(uint64_t ptr)
 {
-/* TBI is known to be enabled. */
 #ifdef CONFIG_USER_ONLY
-ptr = sextract64(ptr, 0, 56);
+/* TBI0 is known to be enabled, while TBI1 is disabled. */
+ptr &= sextract64(ptr, 0, 56);
 #endif
 return ptr;
 }
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 8ddb2556f8..761f0c61bd 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -205,14 +205,10 @@ static void arm_cpu_reset(DeviceState *dev)
 env->vfp.zcr_el[1] = MIN(cpu->sve_max_vq - 1, 3);
 }
 /*
- * Enable TBI0 and TBI1.  While the real kernel only enables TBI0,
- * turning on both here will produce smaller code and otherwise
- * make no difference to the user-level emulation.
- *
- * In sve_probe_page, we assume that this is set.
- * Do not modify this without other changes.
+ * Enable TBI0 but not TBI1.
+ * Note that this must match useronly_clean_ptr.
  */
-env->cp15.tcr_el[1].raw_tcr = (3ULL << 37);
+env->cp15.tcr_el[1].raw_tcr = (1ULL << 37);
 #else
 /* Reset into the highest available EL */
 if (arm_feature(env, ARM_FEATURE_EL3)) {
diff --git a/tests/tcg/aarch64/pauth-2.c b/tests/tcg/aarch64/pauth-2.c
index 9bba0beb63..978652ede3 100644
--- a/tests/tcg/aarch64/pauth-2.c
+++ b/tests/tcg/aarch64/pauth-2.c
@@ -53,7 +53,6 @@ void do_test(uint64_t value)
 int main()
 {
 do_test(0);
-do_test(-1);
 do_test(0xda004acedeadbeefull);
 return 0;
 }
-- 
2.25.1




[PATCH v6 15/31] exec: Rename guest_{addr,range}_valid to *_untagged

2021-02-09 Thread Richard Henderson
The places that use these are better off using untagged
addresses, so do not provide tagged versions.  Rename
them to make the address type clear.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h |  4 ++--
 linux-user/qemu.h   |  4 ++--
 accel/tcg/user-exec.c   |  3 ++-
 linux-user/mmap.c   | 12 ++--
 linux-user/syscall.c|  2 +-
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index c54069e3cd..ce6ce82618 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -87,12 +87,12 @@ static inline void *g2h(CPUState *cs, abi_ptr x)
 return g2h_untagged(cpu_untagged_addr(cs, x));
 }
 
-static inline bool guest_addr_valid(abi_ulong x)
+static inline bool guest_addr_valid_untagged(abi_ulong x)
 {
 return x <= GUEST_ADDR_MAX;
 }
 
-static inline bool guest_range_valid(abi_ulong start, abi_ulong len)
+static inline bool guest_range_valid_untagged(abi_ulong start, abi_ulong len)
 {
 return len - 1 <= GUEST_ADDR_MAX && start <= GUEST_ADDR_MAX - len + 1;
 }
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index ba122a7903..b3ccffbf0f 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -494,8 +494,8 @@ extern unsigned long guest_stack_size;
 static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
 {
 if (size == 0
-? !guest_addr_valid(addr)
-: !guest_range_valid(addr, size)) {
+? !guest_addr_valid_untagged(addr)
+: !guest_range_valid_untagged(addr, size)) {
 return false;
 }
 return page_check_range((target_ulong)addr, size, type) == 0;
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index fa1847b2a6..0d8cc27b21 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -213,7 +213,8 @@ static int probe_access_internal(CPUArchState *env, 
target_ulong addr,
 g_assert_not_reached();
 }
 
-if (!guest_addr_valid(addr) || page_check_range(addr, 1, flags) < 0) {
+if (!guest_addr_valid_untagged(addr) ||
+page_check_range(addr, 1, flags) < 0) {
 if (nonfault) {
 return TLB_INVALID_MASK;
 } else {
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 088c50592c..6690384752 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -119,7 +119,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 }
 len = TARGET_PAGE_ALIGN(len);
 end = start + len;
-if (!guest_range_valid(start, len)) {
+if (!guest_range_valid_untagged(start, len)) {
 return -TARGET_ENOMEM;
 }
 if (len == 0) {
@@ -528,7 +528,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  * It can fail only on 64-bit host with 32-bit target.
  * On any other target/host host mmap() handles this error correctly.
  */
-if (end < start || !guest_range_valid(start, len)) {
+if (end < start || !guest_range_valid_untagged(start, len)) {
 errno = ENOMEM;
 goto fail;
 }
@@ -669,7 +669,7 @@ int target_munmap(abi_ulong start, abi_ulong len)
 if (start & ~TARGET_PAGE_MASK)
 return -TARGET_EINVAL;
 len = TARGET_PAGE_ALIGN(len);
-if (len == 0 || !guest_range_valid(start, len)) {
+if (len == 0 || !guest_range_valid_untagged(start, len)) {
 return -TARGET_EINVAL;
 }
 
@@ -727,9 +727,9 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong 
old_size,
 int prot;
 void *host_addr;
 
-if (!guest_range_valid(old_addr, old_size) ||
+if (!guest_range_valid_untagged(old_addr, old_size) ||
 ((flags & MREMAP_FIXED) &&
- !guest_range_valid(new_addr, new_size))) {
+ !guest_range_valid_untagged(new_addr, new_size))) {
 errno = ENOMEM;
 return -1;
 }
@@ -777,7 +777,7 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong 
old_size,
 
 if (host_addr != MAP_FAILED) {
 /* Check if address fits target address space */
-if (!guest_range_valid(h2g(host_addr), new_size)) {
+if (!guest_range_valid_untagged(h2g(host_addr), new_size)) {
 /* Revert mremap() changes */
 host_addr = mremap(g2h_untagged(old_addr),
new_size, old_size, flags);
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 4451f8e4f0..30a5021509 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -4608,7 +4608,7 @@ static inline abi_ulong do_shmat(CPUArchState *cpu_env,
 return -TARGET_EINVAL;
 }
 }
-if (!guest_range_valid(shmaddr, shm_info.shm_segsz)) {
+if (!guest_range_valid_untagged(shmaddr, shm_info.shm_segsz)) {
 return -TARGET_EINVAL;
 }
 
-- 
2.25.1




[PATCH v6 19/31] linux-user: Handle tags in lock_user/unlock_user

2021-02-09 Thread Richard Henderson
Resolve the untagged address once, using thread_cpu.
Tidy the DEBUG_REMAP code using glib routines.

Signed-off-by: Richard Henderson 
---
 linux-user/uaccess.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/linux-user/uaccess.c b/linux-user/uaccess.c
index 76af6a92b1..c696913016 100644
--- a/linux-user/uaccess.c
+++ b/linux-user/uaccess.c
@@ -6,36 +6,37 @@
 
 void *lock_user(int type, abi_ulong guest_addr, size_t len, bool copy)
 {
+void *host_addr;
+
+guest_addr = cpu_untagged_addr(thread_cpu, guest_addr);
 if (!access_ok_untagged(type, guest_addr, len)) {
 return NULL;
 }
+host_addr = g2h_untagged(guest_addr);
 #ifdef DEBUG_REMAP
-{
-void *addr;
-addr = g_malloc(len);
-if (copy) {
-memcpy(addr, g2h(guest_addr), len);
-} else {
-memset(addr, 0, len);
-}
-return addr;
+if (copy) {
+host_addr = g_memdup(host_addr, len);
+} else {
+host_addr = g_malloc0(len);
 }
-#else
-return g2h_untagged(guest_addr);
 #endif
+return host_addr;
 }
 
 #ifdef DEBUG_REMAP
 void unlock_user(void *host_ptr, abi_ulong guest_addr, size_t len);
 {
+void *host_ptr_conv;
+
 if (!host_ptr) {
 return;
 }
-if (host_ptr == g2h_untagged(guest_addr)) {
+host_ptr_conv = g2h(thread_cpu, guest_addr);
+if (host_ptr == host_ptr_conv) {
 return;
 }
 if (len != 0) {
-memcpy(g2h_untagged(guest_addr), host_ptr, len);
+memcpy(host_ptr_conv, host_ptr, len);
 }
 g_free(host_ptr);
 }
-- 
2.25.1




[PATCH v6 12/31] exec: Use cpu_untagged_addr in g2h; split out g2h_untagged

2021-02-09 Thread Richard Henderson
Use g2h_untagged in contexts that have no cpu, e.g. the binary
loaders that operate before the primary cpu is created.  As a
colollary, target_mmap and friends must use untagged addresses,
since they are used by the loaders.

Use g2h_untagged on values returned from target_mmap, as the
kernel never applies a tag itself.

Use g2h_untagged on all pc values.  The only current user of
tags, aarch64, removes tags from code addresses upon branch,
so "pc" is always untagged.

Use g2h with the cpu context on hand wherever possible.

Use g2h_untagged in lock_user, which will be updated soon.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h  | 12 +-
 include/exec/exec-all.h  |  2 +-
 linux-user/qemu.h|  6 +--
 accel/tcg/translate-all.c|  4 +-
 accel/tcg/user-exec.c| 48 
 linux-user/elfload.c | 12 +++---
 linux-user/flatload.c|  2 +-
 linux-user/hppa/cpu_loop.c   | 31 
 linux-user/i386/cpu_loop.c   |  4 +-
 linux-user/mmap.c| 45 +++---
 linux-user/ppc/signal.c  |  4 +-
 linux-user/syscall.c | 72 +++-
 target/arm/helper-a64.c  |  4 +-
 target/hppa/op_helper.c  |  2 +-
 target/i386/tcg/mem_helper.c |  2 +-
 target/s390x/mem_helper.c|  4 +-
 16 files changed, 135 insertions(+), 119 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index d9dc1de414..c54069e3cd 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -77,7 +77,15 @@ static inline abi_ptr cpu_untagged_addr(CPUState *cs, 
abi_ptr x)
 #endif
 
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
-#define g2h(x) ((void *)((uintptr_t)(abi_ptr)(x) + guest_base))
+static inline void *g2h_untagged(abi_ptr x)
+{
+return (void *)((uintptr_t)(x) + guest_base);
+}
+
+static inline void *g2h(CPUState *cs, abi_ptr x)
+{
+return g2h_untagged(cpu_untagged_addr(cs, x));
+}
 
 static inline bool guest_addr_valid(abi_ulong x)
 {
@@ -448,7 +456,7 @@ static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr 
addr)
 static inline void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
   MMUAccessType access_type, int mmu_idx)
 {
-return g2h(addr);
+return g2h(env_cpu(env), addr);
 }
 #else
 void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index f933c74c44..d30c7a84f6 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -616,7 +616,7 @@ static inline tb_page_addr_t 
get_page_addr_code_hostp(CPUArchState *env,
   void **hostp)
 {
 if (hostp) {
-*hostp = g2h(addr);
+*hostp = g2h_untagged(addr);
 }
 return addr;
 }
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 9251337daf..9fbc5edc4b 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -652,7 +652,7 @@ static inline void *lock_user(int type, abi_ulong 
guest_addr, long len, int copy
 return addr;
 }
 #else
-return g2h(guest_addr);
+return g2h_untagged(guest_addr);
 #endif
 }
 
@@ -666,10 +666,10 @@ static inline void unlock_user(void *host_ptr, abi_ulong 
guest_addr,
 #ifdef DEBUG_REMAP
 if (!host_ptr)
 return;
-if (host_ptr == g2h(guest_addr))
+if (host_ptr == g2h_untagged(guest_addr))
 return;
 if (len > 0)
-memcpy(g2h(guest_addr), host_ptr, len);
+memcpy(g2h_untagged(guest_addr), host_ptr, len);
 g_free(host_ptr);
 #endif
 }
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index bba9c8e0b3..2c34adccce 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1762,7 +1762,7 @@ static inline void tb_page_add(PageDesc *p, 
TranslationBlock *tb,
 prot |= p2->flags;
 p2->flags &= ~PAGE_WRITE;
   }
-mprotect(g2h(page_addr), qemu_host_page_size,
+mprotect(g2h_untagged(page_addr), qemu_host_page_size,
  (prot & PAGE_BITS) & ~PAGE_WRITE);
 if (DEBUG_TB_INVALIDATE_GATE) {
 printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", 
page_addr);
@@ -2912,7 +2912,7 @@ int page_unprotect(target_ulong address, uintptr_t pc)
 }
 #endif
 }
-mprotect((void *)g2h(host_start), qemu_host_page_size,
+mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
  prot & PAGE_BITS);
 }
 mmap_unlock();
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 0b6f56ca40..fa1847b2a6 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -234,7 +234,7 @@ int probe_access_flags(CPUArchState *env, target_ulong addr,
 int flags;
 
 flags = probe_access_internal(env, addr, 0, access_type, nonfault, ra);
-*phost = flags ? NULL : g2h(addr);
+   

[PATCH v6 18/31] linux-user: Fix types in uaccess.c

2021-02-09 Thread Richard Henderson
For copy_*_user, only 0 and -TARGET_EFAULT are returned; no need
to involve abi_long.  Use size_t for lengths.  Use bool for the
lock_user copy argument.  Use ssize_t for target_strlen, because
we can't overflow the host memory space.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h| 14 ++
 linux-user/uaccess.c | 45 ++--
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 36b58bd840..d25a5dafc0 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -7,8 +7,6 @@
 #include "exec/cpu_ldst.h"
 
 #undef DEBUG_REMAP
-#ifdef DEBUG_REMAP
-#endif /* DEBUG_REMAP */
 
 #include "exec/user/abitypes.h"
 
@@ -629,8 +627,8 @@ static inline bool access_ok(CPUState *cpu, int type,
  * buffers between the target and host.  These internally perform
  * locking/unlocking of the memory.
  */
-abi_long copy_from_user(void *hptr, abi_ulong gaddr, size_t len);
-abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t len);
+int copy_from_user(void *hptr, abi_ulong gaddr, size_t len);
+int copy_to_user(abi_ulong gaddr, void *hptr, size_t len);
 
 /* Functions for accessing guest memory.  The tget and tput functions
read/write single values, byteswapping as necessary.  The lock_user function
@@ -640,13 +638,13 @@ abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t 
len);
 
 /* Lock an area of guest memory into the host.  If copy is true then the
host area will have the same contents as the guest.  */
-void *lock_user(int type, abi_ulong guest_addr, long len, int copy);
+void *lock_user(int type, abi_ulong guest_addr, size_t len, bool copy);
 
 /* Unlock an area of guest memory.  The first LEN bytes must be
flushed back to guest memory. host_ptr = NULL is explicitly
allowed and does nothing. */
-#ifdef DEBUG_REMAP
-static inline void unlock_user(void *host_ptr, abi_ulong guest_addr, long len)
+#ifndef DEBUG_REMAP
+static inline void unlock_user(void *host_ptr, abi_ulong guest_addr, size_t 
len)
 { }
 #else
 void unlock_user(void *host_ptr, abi_ulong guest_addr, long len);
@@ -654,7 +652,7 @@ void unlock_user(void *host_ptr, abi_ulong guest_addr, long 
len);
 
 /* Return the length of a string in target memory or -TARGET_EFAULT if
access error. */
-abi_long target_strlen(abi_ulong gaddr);
+ssize_t target_strlen(abi_ulong gaddr);
 
 /* Like lock_user but for null terminated strings.  */
 void *lock_user_string(abi_ulong guest_addr);
diff --git a/linux-user/uaccess.c b/linux-user/uaccess.c
index bba012ed15..76af6a92b1 100644
--- a/linux-user/uaccess.c
+++ b/linux-user/uaccess.c
@@ -4,7 +4,7 @@
 
 #include "qemu.h"
 
-void *lock_user(int type, abi_ulong guest_addr, long len, int copy)
+void *lock_user(int type, abi_ulong guest_addr, size_t len, bool copy)
 {
 if (!access_ok_untagged(type, guest_addr, len)) {
 return NULL;
@@ -26,7 +26,7 @@ void *lock_user(int type, abi_ulong guest_addr, long len, int 
copy)
 }
 
 #ifdef DEBUG_REMAP
-void unlock_user(void *host_ptr, abi_ulong guest_addr, long len);
+void unlock_user(void *host_ptr, abi_ulong guest_addr, size_t len);
 {
 if (!host_ptr) {
 return;
@@ -34,7 +34,7 @@ void unlock_user(void *host_ptr, abi_ulong guest_addr, long 
len);
 if (host_ptr == g2h_untagged(guest_addr)) {
 return;
 }
-if (len > 0) {
+if (len != 0) {
 memcpy(g2h_untagged(guest_addr), host_ptr, len);
 }
 g_free(host_ptr);
@@ -43,53 +43,53 @@ void unlock_user(void *host_ptr, abi_ulong guest_addr, long 
len);
 
 void *lock_user_string(abi_ulong guest_addr)
 {
-abi_long len = target_strlen(guest_addr);
+ssize_t len = target_strlen(guest_addr);
 if (len < 0) {
 return NULL;
 }
-return lock_user(VERIFY_READ, guest_addr, (long)(len + 1), 1);
+return lock_user(VERIFY_READ, guest_addr, (size_t)len + 1, 1);
 }
 
 /* copy_from_user() and copy_to_user() are usually used to copy data
  * buffers between the target and host.  These internally perform
  * locking/unlocking of the memory.
  */
-abi_long copy_from_user(void *hptr, abi_ulong gaddr, size_t len)
+int copy_from_user(void *hptr, abi_ulong gaddr, size_t len)
 {
-abi_long ret = 0;
-void *ghptr;
+int ret = 0;
+void *ghptr = lock_user(VERIFY_READ, gaddr, len, 1);
 
-if ((ghptr = lock_user(VERIFY_READ, gaddr, len, 1))) {
+if (ghptr) {
 memcpy(hptr, ghptr, len);
 unlock_user(ghptr, gaddr, 0);
-} else
+} else {
 ret = -TARGET_EFAULT;
-
+}
 return ret;
 }
 
-
-abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t len)
+int copy_to_user(abi_ulong gaddr, void *hptr, size_t len)
 {
-abi_long ret = 0;
-void *ghptr;
+int ret = 0;
+void *ghptr = lock_user(VERIFY_WRITE, gaddr, len, 0);
 
-if ((ghptr = lock_user(VERIFY_WRITE, gaddr, len, 0))) {
+if (ghptr) {
 memcpy(ghptr, hptr, len);
 unlock_user(ghptr, gaddr, 

[PATCH v6 10/31] linux-user: Fix guest_addr_valid vs reserved_va

2021-02-09 Thread Richard Henderson
We must always use GUEST_ADDR_MAX, because even 32-bit hosts can
use -R  to restrict the memory address of the guest.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 4e6ef3d542..e62f4fba00 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -72,11 +72,10 @@ typedef uint64_t abi_ptr;
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
 #define g2h(x) ((void *)((uintptr_t)(abi_ptr)(x) + guest_base))
 
-#if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
-#define guest_addr_valid(x) (1)
-#else
-#define guest_addr_valid(x) ((x) <= GUEST_ADDR_MAX)
-#endif
+static inline bool guest_addr_valid(abi_ulong x)
+{
+return x <= GUEST_ADDR_MAX;
+}
 
 static inline bool guest_range_valid(abi_ulong start, abi_ulong len)
 {
-- 
2.25.1




[PATCH v6 17/31] linux-user: Move lock_user et al out of line

2021-02-09 Thread Richard Henderson
These functions are not small, except for unlock_user
without debugging enabled.  Move them out of line, and
add missing braces on the way.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h| 45 ++-
 linux-user/uaccess.c | 46 
 2 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 82eabb73f8..36b58bd840 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -640,57 +640,24 @@ abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t 
len);
 
 /* Lock an area of guest memory into the host.  If copy is true then the
host area will have the same contents as the guest.  */
-static inline void *lock_user(int type, abi_ulong guest_addr, long len, int 
copy)
-{
-if (!access_ok_untagged(type, guest_addr, len)) {
-return NULL;
-}
-#ifdef DEBUG_REMAP
-{
-void *addr;
-addr = g_malloc(len);
-if (copy)
-memcpy(addr, g2h(guest_addr), len);
-else
-memset(addr, 0, len);
-return addr;
-}
-#else
-return g2h_untagged(guest_addr);
-#endif
-}
+void *lock_user(int type, abi_ulong guest_addr, long len, int copy);
 
 /* Unlock an area of guest memory.  The first LEN bytes must be
flushed back to guest memory. host_ptr = NULL is explicitly
allowed and does nothing. */
-static inline void unlock_user(void *host_ptr, abi_ulong guest_addr,
-   long len)
-{
-
 #ifdef DEBUG_REMAP
-if (!host_ptr)
-return;
-if (host_ptr == g2h_untagged(guest_addr))
-return;
-if (len > 0)
-memcpy(g2h_untagged(guest_addr), host_ptr, len);
-g_free(host_ptr);
+static inline void unlock_user(void *host_ptr, abi_ulong guest_addr, long len)
+{ }
+#else
+void unlock_user(void *host_ptr, abi_ulong guest_addr, long len);
 #endif
-}
 
 /* Return the length of a string in target memory or -TARGET_EFAULT if
access error. */
 abi_long target_strlen(abi_ulong gaddr);
 
 /* Like lock_user but for null terminated strings.  */
-static inline void *lock_user_string(abi_ulong guest_addr)
-{
-abi_long len;
-len = target_strlen(guest_addr);
-if (len < 0)
-return NULL;
-return lock_user(VERIFY_READ, guest_addr, (long)(len + 1), 1);
-}
+void *lock_user_string(abi_ulong guest_addr);
 
 /* Helper macros for locking/unlocking a target struct.  */
 #define lock_user_struct(type, host_ptr, guest_addr, copy) \
diff --git a/linux-user/uaccess.c b/linux-user/uaccess.c
index e215ecc2a6..bba012ed15 100644
--- a/linux-user/uaccess.c
+++ b/linux-user/uaccess.c
@@ -4,6 +4,52 @@
 
 #include "qemu.h"
 
+void *lock_user(int type, abi_ulong guest_addr, long len, int copy)
+{
+if (!access_ok_untagged(type, guest_addr, len)) {
+return NULL;
+}
+#ifdef DEBUG_REMAP
+{
+void *addr;
+addr = g_malloc(len);
+if (copy) {
+memcpy(addr, g2h(guest_addr), len);
+} else {
+memset(addr, 0, len);
+}
+return addr;
+}
+#else
+return g2h_untagged(guest_addr);
+#endif
+}
+
+#ifdef DEBUG_REMAP
+void unlock_user(void *host_ptr, abi_ulong guest_addr, long len);
+{
+if (!host_ptr) {
+return;
+}
+if (host_ptr == g2h_untagged(guest_addr)) {
+return;
+}
+if (len > 0) {
+memcpy(g2h_untagged(guest_addr), host_ptr, len);
+}
+g_free(host_ptr);
+}
+#endif
+
+void *lock_user_string(abi_ulong guest_addr)
+{
+abi_long len = target_strlen(guest_addr);
+if (len < 0) {
+return NULL;
+}
+return lock_user(VERIFY_READ, guest_addr, (long)(len + 1), 1);
+}
+
 /* copy_from_user() and copy_to_user() are usually used to copy data
  * buffers between the target and host.  These internally perform
  * locking/unlocking of the memory.
-- 
2.25.1




[PATCH v6 08/31] bsd-user: Tidy VERIFY_READ/VERIFY_WRITE

2021-02-09 Thread Richard Henderson
These constants are only ever used with access_ok, and friends.
Rather than translating them to PAGE_* bits, let them equal
the PAGE_* bits to begin.

Reviewed-by: Warner Losh 
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 bsd-user/qemu.h | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h
index f8bb1e5459..4076adabd0 100644
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -218,13 +218,12 @@ extern unsigned long x86_stack_size;
 
 /* user access */
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1 /* implies read access */
+#define VERIFY_READ  PAGE_READ
+#define VERIFY_WRITE (PAGE_READ | PAGE_WRITE)
 
-static inline int access_ok(int type, abi_ulong addr, abi_ulong size)
+static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
 {
-return page_check_range((target_ulong)addr, size,
-(type == VERIFY_READ) ? PAGE_READ : (PAGE_READ | 
PAGE_WRITE)) == 0;
+return page_check_range((target_ulong)addr, size, type) == 0;
 }
 
 /* NOTE __get_user and __put_user use host pointers and don't check access. */
-- 
2.25.1




[PATCH v6 11/31] exec: Introduce cpu_untagged_addr

2021-02-09 Thread Richard Henderson
Provide an identity fallback for targets that do not
use tagged addresses.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index e62f4fba00..d9dc1de414 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -69,6 +69,13 @@ typedef uint64_t abi_ptr;
 #define TARGET_ABI_FMT_ptr "%"PRIx64
 #endif
 
+#ifndef TARGET_TAGGED_ADDRESSES
+static inline abi_ptr cpu_untagged_addr(CPUState *cs, abi_ptr x)
+{
+return x;
+}
+#endif
+
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
 #define g2h(x) ((void *)((uintptr_t)(abi_ptr)(x) + guest_base))
 
-- 
2.25.1




[PATCH v6 13/31] linux-user: Explicitly untag memory management syscalls

2021-02-09 Thread Richard Henderson
We define target_mmap et al as untagged, so that they can be
used from the binary loaders.  Explicitly call cpu_untagged_addr
for munmap, mprotect, mremap syscall entry points.

Add a few comments for the syscalls that are exempted by the
kernel's tagged-address-abi.rst.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/syscall.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 748893904e..4451f8e4f0 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -889,6 +889,8 @@ abi_long do_brk(abi_ulong new_brk)
 abi_long mapped_addr;
 abi_ulong new_alloc_size;
 
+/* brk pointers are always untagged */
+
 DEBUGF_BRK("do_brk(" TARGET_ABI_FMT_lx ") -> ", new_brk);
 
 if (!new_brk) {
@@ -4588,6 +4590,8 @@ static inline abi_ulong do_shmat(CPUArchState *cpu_env,
 int i,ret;
 abi_ulong shmlba;
 
+/* shmat pointers are always untagged */
+
 /* find out the length of the shared memory segment */
 ret = get_errno(shmctl(shmid, IPC_STAT, _info));
 if (is_error(ret)) {
@@ -4655,6 +4659,8 @@ static inline abi_long do_shmdt(abi_ulong shmaddr)
 int i;
 abi_long rv;
 
+/* shmdt pointers are always untagged */
+
 mmap_lock();
 
 for (i = 0; i < N_SHM_REGIONS; ++i) {
@@ -9691,6 +9697,7 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 v5, v6));
 }
 #else
+/* mmap pointers are always untagged */
 ret = get_errno(target_mmap(arg1, arg2, arg3,
 target_to_host_bitmask(arg4, 
mmap_flags_tbl),
 arg5,
@@ -9709,8 +9716,10 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 return get_errno(ret);
 #endif
 case TARGET_NR_munmap:
+arg1 = cpu_untagged_addr(cpu, arg1);
 return get_errno(target_munmap(arg1, arg2));
 case TARGET_NR_mprotect:
+arg1 = cpu_untagged_addr(cpu, arg1);
 {
 TaskState *ts = cpu->opaque;
 /* Special hack to detect libc making the stack executable.  */
@@ -9725,6 +9734,8 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 return get_errno(target_mprotect(arg1, arg2, arg3));
 #ifdef TARGET_NR_mremap
 case TARGET_NR_mremap:
+arg1 = cpu_untagged_addr(cpu, arg1);
+/* mremap new_addr (arg5) is always untagged */
 return get_errno(target_mremap(arg1, arg2, arg3, arg4, arg5));
 #endif
 /* ??? msync/mlock/munlock are broken for softmmu.  */
-- 
2.25.1




[PATCH v6 16/31] linux-user: Use cpu_untagged_addr in access_ok; split out *_untagged

2021-02-09 Thread Richard Henderson
Provide both tagged and untagged versions of access_ok.
In a few places use thread_cpu, as the user is several
callees removed from do_syscall1.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h  | 11 +--
 linux-user/elfload.c   |  2 +-
 linux-user/hppa/cpu_loop.c |  8 
 linux-user/i386/cpu_loop.c |  2 +-
 linux-user/i386/signal.c   |  5 +++--
 linux-user/syscall.c   |  9 ++---
 6 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index b3ccffbf0f..82eabb73f8 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -491,7 +491,7 @@ extern unsigned long guest_stack_size;
 #define VERIFY_READ  PAGE_READ
 #define VERIFY_WRITE (PAGE_READ | PAGE_WRITE)
 
-static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
+static inline bool access_ok_untagged(int type, abi_ulong addr, abi_ulong size)
 {
 if (size == 0
 ? !guest_addr_valid_untagged(addr)
@@ -501,6 +501,12 @@ static inline bool access_ok(int type, abi_ulong addr, 
abi_ulong size)
 return page_check_range((target_ulong)addr, size, type) == 0;
 }
 
+static inline bool access_ok(CPUState *cpu, int type,
+ abi_ulong addr, abi_ulong size)
+{
+return access_ok_untagged(type, cpu_untagged_addr(cpu, addr), size);
+}
+
 /* NOTE __get_user and __put_user use host pointers and don't check access.
These are usually used to access struct data members once the struct has
been locked - usually with lock_user_struct.  */
@@ -636,8 +642,9 @@ abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t 
len);
host area will have the same contents as the guest.  */
 static inline void *lock_user(int type, abi_ulong guest_addr, long len, int 
copy)
 {
-if (!access_ok(type, guest_addr, len))
+if (!access_ok_untagged(type, guest_addr, len)) {
 return NULL;
+}
 #ifdef DEBUG_REMAP
 {
 void *addr;
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index f542841ba2..e7209e03cb 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3500,7 +3500,7 @@ static int vma_get_mapping_count(const struct mm_struct 
*mm)
 static abi_ulong vma_dump_size(const struct vm_area_struct *vma)
 {
 /* if we cannot even read the first page, skip it */
-if (!access_ok(VERIFY_READ, vma->vma_start, TARGET_PAGE_SIZE))
+if (!access_ok_untagged(VERIFY_READ, vma->vma_start, TARGET_PAGE_SIZE))
 return (0);
 
 /*
diff --git a/linux-user/hppa/cpu_loop.c b/linux-user/hppa/cpu_loop.c
index 944511bbe4..3aaaf3337c 100644
--- a/linux-user/hppa/cpu_loop.c
+++ b/linux-user/hppa/cpu_loop.c
@@ -35,7 +35,7 @@ static abi_ulong hppa_lws(CPUHPPAState *env)
 return -TARGET_ENOSYS;
 
 case 0: /* elf32 atomic 32bit cmpxchg */
-if ((addr & 3) || !access_ok(VERIFY_WRITE, addr, 4)) {
+if ((addr & 3) || !access_ok(cs, VERIFY_WRITE, addr, 4)) {
 return -TARGET_EFAULT;
 }
 old = tswap32(old);
@@ -50,9 +50,9 @@ static abi_ulong hppa_lws(CPUHPPAState *env)
 return -TARGET_ENOSYS;
 }
 if (((addr | old | new) & ((1 << size) - 1))
-|| !access_ok(VERIFY_WRITE, addr, 1 << size)
-|| !access_ok(VERIFY_READ, old, 1 << size)
-|| !access_ok(VERIFY_READ, new, 1 << size)) {
+|| !access_ok(cs, VERIFY_WRITE, addr, 1 << size)
+|| !access_ok(cs, VERIFY_READ, old, 1 << size)
+|| !access_ok(cs, VERIFY_READ, new, 1 << size)) {
 return -TARGET_EFAULT;
 }
 /* Note that below we use host-endian loads so that the cmpxchg
diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c
index 19c8a18cd3..f813e87294 100644
--- a/linux-user/i386/cpu_loop.c
+++ b/linux-user/i386/cpu_loop.c
@@ -99,7 +99,7 @@ static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, 
size_t len)
  * For all the vsyscalls, NULL means "don't write anything" not
  * "write it at address 0".
  */
-if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) {
+if (addr == 0 || access_ok(env_cpu(env), VERIFY_WRITE, addr, len)) {
 return true;
 }
 
diff --git a/linux-user/i386/signal.c b/linux-user/i386/signal.c
index 97a39204cc..9320e1d472 100644
--- a/linux-user/i386/signal.c
+++ b/linux-user/i386/signal.c
@@ -513,9 +513,10 @@ restore_sigcontext(CPUX86State *env, struct 
target_sigcontext *sc)
 
 fpstate_addr = tswapl(sc->fpstate);
 if (fpstate_addr != 0) {
-if (!access_ok(VERIFY_READ, fpstate_addr,
-   sizeof(struct target_fpstate)))
+if (!access_ok(env_cpu(env), VERIFY_READ, fpstate_addr,
+   sizeof(struct target_fpstate))) {
 goto badframe;
+}
 #ifndef TARGET_X86_64
 cpu_x86_frstor(env, fpstate_addr, 1);
 #else
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 30a5021509..24fc1daf02 100644
--- 

[PATCH v6 00/31] target-arm: Implement ARMv8.5-MemTag, user mode

2021-02-09 Thread Richard Henderson
Changes for v6:
  * Drop the change to probe_access.  The cpu_untagged_addr function
isn't correct, since that's specifically for syscalls.  The uses
of probe_access in target/arm/ are already done with clean addresses.
  * Move unlock_user comparison change from p19 to p18.

The only unreviewed patch is 19.


r~


Richard Henderson (31):
  tcg: Introduce target-specific page data for user-only
  linux-user: Introduce PAGE_ANON
  exec: Use uintptr_t for guest_base
  exec: Use uintptr_t in cpu_ldst.h
  exec: Improve types for guest_addr_valid
  linux-user: Check for overflow in access_ok
  linux-user: Tidy VERIFY_READ/VERIFY_WRITE
  bsd-user: Tidy VERIFY_READ/VERIFY_WRITE
  linux-user: Do not use guest_addr_valid for h2g_valid
  linux-user: Fix guest_addr_valid vs reserved_va
  exec: Introduce cpu_untagged_addr
  exec: Use cpu_untagged_addr in g2h; split out g2h_untagged
  linux-user: Explicitly untag memory management syscalls
  linux-user: Use guest_range_valid in access_ok
  exec: Rename guest_{addr,range}_valid to *_untagged
  linux-user: Use cpu_untagged_addr in access_ok; split out *_untagged
  linux-user: Move lock_user et al out of line
  linux-user: Fix types in uaccess.c
  linux-user: Handle tags in lock_user/unlock_user
  linux-user/aarch64: Implement PR_TAGGED_ADDR_ENABLE
  target/arm: Improve gen_top_byte_ignore
  target/arm: Use the proper TBI settings for linux-user
  linux-user/aarch64: Implement PR_MTE_TCF and PR_MTE_TAG
  linux-user/aarch64: Implement PROT_MTE
  target/arm: Split out syndrome.h from internals.h
  linux-user/aarch64: Pass syndrome to EXC_*_ABORT
  linux-user/aarch64: Signal SEGV_MTESERR for sync tag check fault
  linux-user/aarch64: Signal SEGV_MTEAERR for async tag check error
  target/arm: Add allocation tag storage for user mode
  target/arm: Enable MTE for user-only
  tests/tcg/aarch64: Add mte smoke tests

 bsd-user/qemu.h |   9 +-
 include/exec/cpu-all.h  |  47 -
 include/exec/cpu_ldst.h |  39 ++--
 include/exec/exec-all.h |   2 +-
 linux-user/aarch64/target_signal.h  |   3 +
 linux-user/aarch64/target_syscall.h |  13 ++
 linux-user/qemu.h   |  76 +++-
 linux-user/syscall_defs.h   |   1 +
 target/arm/cpu-param.h  |   3 +
 target/arm/cpu.h|  32 
 target/arm/internals.h  | 249 +
 target/arm/syndrome.h   | 273 
 tests/tcg/aarch64/mte.h |  60 ++
 accel/tcg/translate-all.c   |  32 +++-
 accel/tcg/user-exec.c   |  51 +++---
 bsd-user/main.c |   4 +-
 linux-user/aarch64/cpu_loop.c   |  38 +++-
 linux-user/elfload.c|  18 +-
 linux-user/flatload.c   |   2 +-
 linux-user/hppa/cpu_loop.c  |  39 ++--
 linux-user/i386/cpu_loop.c  |   6 +-
 linux-user/i386/signal.c|   5 +-
 linux-user/main.c   |   4 +-
 linux-user/mmap.c   |  86 +
 linux-user/ppc/signal.c |   4 +-
 linux-user/syscall.c| 165 +
 linux-user/uaccess.c|  82 +++--
 target/arm/cpu.c|  25 ++-
 target/arm/helper-a64.c |   4 +-
 target/arm/mte_helper.c |  39 +++-
 target/arm/tlb_helper.c |  15 +-
 target/arm/translate-a64.c  |  25 +--
 target/hppa/op_helper.c |   2 +-
 target/i386/tcg/mem_helper.c|   2 +-
 target/s390x/mem_helper.c   |   4 +-
 tests/tcg/aarch64/mte-1.c   |  28 +++
 tests/tcg/aarch64/mte-2.c   |  45 +
 tests/tcg/aarch64/mte-3.c   |  51 ++
 tests/tcg/aarch64/mte-4.c   |  45 +
 tests/tcg/aarch64/pauth-2.c |   1 -
 tests/tcg/aarch64/Makefile.target   |   6 +
 tests/tcg/configure.sh  |   4 +
 42 files changed, 1122 insertions(+), 517 deletions(-)
 create mode 100644 target/arm/syndrome.h
 create mode 100644 tests/tcg/aarch64/mte.h
 create mode 100644 tests/tcg/aarch64/mte-1.c
 create mode 100644 tests/tcg/aarch64/mte-2.c
 create mode 100644 tests/tcg/aarch64/mte-3.c
 create mode 100644 tests/tcg/aarch64/mte-4.c

-- 
2.25.1




[PATCH v6 09/31] linux-user: Do not use guest_addr_valid for h2g_valid

2021-02-09 Thread Richard Henderson
This is the only use of guest_addr_valid that does not begin
with a guest address, but a host address being transformed to
a guest address.

We will shortly adjust guest_addr_valid to handle guest memory
tags, and the host address should not be subjected to that.

Move h2g_valid adjacent to the other h2g macros.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 5e8878ee9b..4e6ef3d542 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -77,13 +77,16 @@ typedef uint64_t abi_ptr;
 #else
 #define guest_addr_valid(x) ((x) <= GUEST_ADDR_MAX)
 #endif
-#define h2g_valid(x) guest_addr_valid((uintptr_t)(x) - guest_base)
 
 static inline bool guest_range_valid(abi_ulong start, abi_ulong len)
 {
 return len - 1 <= GUEST_ADDR_MAX && start <= GUEST_ADDR_MAX - len + 1;
 }
 
+#define h2g_valid(x) \
+(HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS || \
+ (uintptr_t)(x) - guest_base <= GUEST_ADDR_MAX)
+
 #define h2g_nocheck(x) ({ \
 uintptr_t __ret = (uintptr_t)(x) - guest_base; \
 (abi_ptr)__ret; \
-- 
2.25.1




[PATCH v6 06/31] linux-user: Check for overflow in access_ok

2021-02-09 Thread Richard Henderson
Verify that addr + size - 1 does not wrap around.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 17aa992165..441ba6a78b 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -491,12 +491,19 @@ extern unsigned long guest_stack_size;
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1 /* implies read access */
 
-static inline int access_ok(int type, abi_ulong addr, abi_ulong size)
+static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
 {
-return guest_addr_valid(addr) &&
-   (size == 0 || guest_addr_valid(addr + size - 1)) &&
-   page_check_range((target_ulong)addr, size,
-(type == VERIFY_READ) ? PAGE_READ : (PAGE_READ | 
PAGE_WRITE)) == 0;
+if (!guest_addr_valid(addr)) {
+return false;
+}
+if (size != 0 &&
+(addr + size - 1 < addr ||
+ !guest_addr_valid(addr + size - 1))) {
+return false;
+}
+return page_check_range((target_ulong)addr, size,
+(type == VERIFY_READ) ? PAGE_READ :
+(PAGE_READ | PAGE_WRITE)) == 0;
 }
 
 /* NOTE __get_user and __put_user use host pointers and don't check access.
-- 
2.25.1




[PATCH v6 14/31] linux-user: Use guest_range_valid in access_ok

2021-02-09 Thread Richard Henderson
We're currently open-coding the range check in access_ok;
use guest_range_valid when size != 0.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 9fbc5edc4b..ba122a7903 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -493,12 +493,9 @@ extern unsigned long guest_stack_size;
 
 static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
 {
-if (!guest_addr_valid(addr)) {
-return false;
-}
-if (size != 0 &&
-(addr + size - 1 < addr ||
- !guest_addr_valid(addr + size - 1))) {
+if (size == 0
+? !guest_addr_valid(addr)
+: !guest_range_valid(addr, size)) {
 return false;
 }
 return page_check_range((target_ulong)addr, size, type) == 0;
-- 
2.25.1




[PATCH v6 04/31] exec: Use uintptr_t in cpu_ldst.h

2021-02-09 Thread Richard Henderson
This is more descriptive than 'unsigned long'.
No functional change, since these match on all linux+bsd hosts.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index ef54cb7e1f..3f9063aade 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -70,14 +70,14 @@ typedef uint64_t abi_ptr;
 #endif
 
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
-#define g2h(x) ((void *)((unsigned long)(abi_ptr)(x) + guest_base))
+#define g2h(x) ((void *)((uintptr_t)(abi_ptr)(x) + guest_base))
 
 #if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
 #define guest_addr_valid(x) (1)
 #else
 #define guest_addr_valid(x) ((x) <= GUEST_ADDR_MAX)
 #endif
-#define h2g_valid(x) guest_addr_valid((unsigned long)(x) - guest_base)
+#define h2g_valid(x) guest_addr_valid((uintptr_t)(x) - guest_base)
 
 static inline int guest_range_valid(unsigned long start, unsigned long len)
 {
@@ -85,7 +85,7 @@ static inline int guest_range_valid(unsigned long start, 
unsigned long len)
 }
 
 #define h2g_nocheck(x) ({ \
-unsigned long __ret = (unsigned long)(x) - guest_base; \
+uintptr_t __ret = (uintptr_t)(x) - guest_base; \
 (abi_ptr)__ret; \
 })
 
-- 
2.25.1




[PATCH v6 05/31] exec: Improve types for guest_addr_valid

2021-02-09 Thread Richard Henderson
Return bool not int; pass abi_ulong not 'unsigned long'.
All callers use abi_ulong already, so the change in type
has no effect.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 3f9063aade..5e8878ee9b 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -79,7 +79,7 @@ typedef uint64_t abi_ptr;
 #endif
 #define h2g_valid(x) guest_addr_valid((uintptr_t)(x) - guest_base)
 
-static inline int guest_range_valid(unsigned long start, unsigned long len)
+static inline bool guest_range_valid(abi_ulong start, abi_ulong len)
 {
 return len - 1 <= GUEST_ADDR_MAX && start <= GUEST_ADDR_MAX - len + 1;
 }
-- 
2.25.1




[PATCH v6 07/31] linux-user: Tidy VERIFY_READ/VERIFY_WRITE

2021-02-09 Thread Richard Henderson
These constants are only ever used with access_ok, and friends.
Rather than translating them to PAGE_* bits, let them equal
the PAGE_* bits to begin.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/qemu.h | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 441ba6a78b..9251337daf 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -488,8 +488,8 @@ extern unsigned long guest_stack_size;
 
 /* user access */
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1 /* implies read access */
+#define VERIFY_READ  PAGE_READ
+#define VERIFY_WRITE (PAGE_READ | PAGE_WRITE)
 
 static inline bool access_ok(int type, abi_ulong addr, abi_ulong size)
 {
@@ -501,9 +501,7 @@ static inline bool access_ok(int type, abi_ulong addr, 
abi_ulong size)
  !guest_addr_valid(addr + size - 1))) {
 return false;
 }
-return page_check_range((target_ulong)addr, size,
-(type == VERIFY_READ) ? PAGE_READ :
-(PAGE_READ | PAGE_WRITE)) == 0;
+return page_check_range((target_ulong)addr, size, type) == 0;
 }
 
 /* NOTE __get_user and __put_user use host pointers and don't check access.
-- 
2.25.1




[PATCH v6 03/31] exec: Use uintptr_t for guest_base

2021-02-09 Thread Richard Henderson
This is more descriptive than 'unsigned long'.
No functional change, since these match on all linux+bsd hosts.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu-all.h | 2 +-
 bsd-user/main.c| 4 ++--
 linux-user/elfload.c   | 4 ++--
 linux-user/main.c  | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 1f47e0fe44..d6ad774c01 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -158,7 +158,7 @@ static inline void tswap64s(uint64_t *s)
 /* On some host systems the guest address space is reserved on the host.
  * This allows the guest address space to be offset to a convenient location.
  */
-extern unsigned long guest_base;
+extern uintptr_t guest_base;
 extern bool have_guest_base;
 extern unsigned long reserved_va;
 
diff --git a/bsd-user/main.c b/bsd-user/main.c
index 7cc08024e3..385d35886a 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -42,7 +42,7 @@
 
 int singlestep;
 unsigned long mmap_min_addr;
-unsigned long guest_base;
+uintptr_t guest_base;
 bool have_guest_base;
 unsigned long reserved_va;
 
@@ -970,7 +970,7 @@ int main(int argc, char **argv)
 g_free(target_environ);
 
 if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
-qemu_log("guest_base  0x%lx\n", guest_base);
+qemu_log("guest_base  %p\n", (void *)guest_base);
 log_page_dump("binary load");
 
 qemu_log("start_brk   0x" TARGET_ABI_FMT_lx "\n", info->start_brk);
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index a64050713f..29f07bb234 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2135,9 +2135,9 @@ static void pgb_have_guest_base(const char *image_name, 
abi_ulong guest_loaddr,
 void *addr, *test;
 
 if (!QEMU_IS_ALIGNED(guest_base, align)) {
-fprintf(stderr, "Requested guest base 0x%lx does not satisfy "
+fprintf(stderr, "Requested guest base %p does not satisfy "
 "host minimum alignment (0x%lx)\n",
-guest_base, align);
+(void *)guest_base, align);
 exit(EXIT_FAILURE);
 }
 
diff --git a/linux-user/main.c b/linux-user/main.c
index 2e3c169878..81f48ff54e 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -59,7 +59,7 @@ static const char *cpu_model;
 static const char *cpu_type;
 static const char *seed_optarg;
 unsigned long mmap_min_addr;
-unsigned long guest_base;
+uintptr_t guest_base;
 bool have_guest_base;
 
 /*
@@ -824,7 +824,7 @@ int main(int argc, char **argv, char **envp)
 g_free(target_environ);
 
 if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
-qemu_log("guest_base  0x%lx\n", guest_base);
+qemu_log("guest_base  %p\n", (void *)guest_base);
 log_page_dump("binary load");
 
 qemu_log("start_brk   0x" TARGET_ABI_FMT_lx "\n", info->start_brk);
-- 
2.25.1




[PATCH v6 01/31] tcg: Introduce target-specific page data for user-only

2021-02-09 Thread Richard Henderson
This data can be allocated by page_alloc_target_data() and
released by page_set_flags(start, end, prot | PAGE_RESET).

This data will be used to hold tag memory for AArch64 MTE.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu-all.h| 42 +--
 accel/tcg/translate-all.c | 28 ++
 linux-user/mmap.c |  4 +++-
 linux-user/syscall.c  |  4 ++--
 4 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index cfb1d79331..af555f1798 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -264,15 +264,21 @@ extern intptr_t qemu_host_page_mask;
 #define PAGE_EXEC  0x0004
 #define PAGE_BITS  (PAGE_READ | PAGE_WRITE | PAGE_EXEC)
 #define PAGE_VALID 0x0008
-/* original state of the write flag (used when tracking self-modifying
-   code */
+/*
+ * Original state of the write flag (used when tracking self-modifying code)
+ */
 #define PAGE_WRITE_ORG 0x0010
-/* Invalidate the TLB entry immediately, helpful for s390x
- * Low-Address-Protection. Used with PAGE_WRITE in tlb_set_page_with_attrs() */
-#define PAGE_WRITE_INV 0x0040
+/*
+ * Invalidate the TLB entry immediately, helpful for s390x
+ * Low-Address-Protection. Used with PAGE_WRITE in tlb_set_page_with_attrs()
+ */
+#define PAGE_WRITE_INV 0x0020
+/* For use with page_set_flags: page is being replaced; target_data cleared. */
+#define PAGE_RESET 0x0040
+
 #if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
 /* FIXME: Code that sets/uses this is broken and needs to go away.  */
-#define PAGE_RESERVED  0x0020
+#define PAGE_RESERVED  0x0100
 #endif
 /* Target-specific bits that will be used via page_get_flags().  */
 #define PAGE_TARGET_1  0x0080
@@ -287,6 +293,30 @@ int walk_memory_regions(void *, walk_memory_regions_fn);
 int page_get_flags(target_ulong address);
 void page_set_flags(target_ulong start, target_ulong end, int flags);
 int page_check_range(target_ulong start, target_ulong len, int flags);
+
+/**
+ * page_alloc_target_data(address, size)
+ * @address: guest virtual address
+ * @size: size of data to allocate
+ *
+ * Allocate @size bytes of out-of-band data to associate with the
+ * guest page at @address.  If the page is not mapped, NULL will
+ * be returned.  If there is existing data associated with @address,
+ * no new memory will be allocated.
+ *
+ * The memory will be freed when the guest page is deallocated,
+ * e.g. with the munmap system call.
+ */
+void *page_alloc_target_data(target_ulong address, size_t size);
+
+/**
+ * page_get_target_data(address)
+ * @address: guest virtual address
+ *
+ * Return any out-of-band memory associated with the guest page
+ * at @address, as per page_alloc_target_data.
+ */
+void *page_get_target_data(target_ulong address);
 #endif
 
 CPUArchState *cpu_copy(CPUArchState *env);
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 81d4c83f22..bba9c8e0b3 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -114,6 +114,7 @@ typedef struct PageDesc {
 unsigned int code_write_count;
 #else
 unsigned long flags;
+void *target_data;
 #endif
 #ifndef CONFIG_USER_ONLY
 QemuSpin lock;
@@ -2740,6 +2741,7 @@ int page_get_flags(target_ulong address)
 void page_set_flags(target_ulong start, target_ulong end, int flags)
 {
 target_ulong addr, len;
+bool reset_target_data;
 
 /* This function should never be called with addresses outside the
guest address space.  If this assert fires, it probably indicates
@@ -2754,6 +2756,8 @@ void page_set_flags(target_ulong start, target_ulong end, 
int flags)
 if (flags & PAGE_WRITE) {
 flags |= PAGE_WRITE_ORG;
 }
+reset_target_data = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
+flags &= ~PAGE_RESET;
 
 for (addr = start, len = end - start;
  len != 0;
@@ -2767,10 +2771,34 @@ void page_set_flags(target_ulong start, target_ulong 
end, int flags)
 p->first_tb) {
 tb_invalidate_phys_page(addr, 0);
 }
+if (reset_target_data && p->target_data) {
+g_free(p->target_data);
+p->target_data = NULL;
+}
 p->flags = flags;
 }
 }
 
+void *page_get_target_data(target_ulong address)
+{
+PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
+return p ? p->target_data : NULL;
+}
+
+void *page_alloc_target_data(target_ulong address, size_t size)
+{
+PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
+void *ret = NULL;
+
+if (p->flags & PAGE_VALID) {
+ret = p->target_data;
+if (!ret) {
+p->target_data = ret = g_malloc0(size);
+}
+}
+return ret;
+}
+
 int page_check_range(target_ulong start, target_ulong len, int flags)
 {
 PageDesc *p;
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 810653c503..c693505b60 100644
--- a/linux-user/mmap.c
+++ 

[PATCH v6 02/31] linux-user: Introduce PAGE_ANON

2021-02-09 Thread Richard Henderson
Record whether the backing page is anonymous, or if it has file
backing.  This will allow us to get close to the Linux AArch64
ABI for MTE, which allows tag memory only on ram-backed VMAs.

The real ABI allows tag memory on files, when those files are
on ram-backed filesystems, such as tmpfs.  We will not be able
to implement that in QEMU linux-user.

Thankfully, anonymous memory for malloc arenas is the primary
consumer of this feature, so this restricted version should
still be of use.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu-all.h | 2 ++
 linux-user/mmap.c  | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index af555f1798..1f47e0fe44 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -275,6 +275,8 @@ extern intptr_t qemu_host_page_mask;
 #define PAGE_WRITE_INV 0x0020
 /* For use with page_set_flags: page is being replaced; target_data cleared. */
 #define PAGE_RESET 0x0040
+/* For linux-user, indicates that the page is MAP_ANON. */
+#define PAGE_ANON  0x0080
 
 #if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
 /* FIXME: Code that sets/uses this is broken and needs to go away.  */
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index c693505b60..7fb4c628e1 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -599,6 +599,9 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 }
 }
  the_end1:
+if (flags & MAP_ANONYMOUS) {
+page_flags |= PAGE_ANON;
+}
 page_flags |= PAGE_RESET;
 page_set_flags(start, start + len, page_flags);
  the_end:
-- 
2.25.1




Re: [PATCH v2] i386: Add the support for AMD EPYC 3rd generation processors

2021-02-09 Thread Eduardo Habkost
On Tue, Feb 09, 2021 at 03:04:05PM -0600, Babu Moger wrote:
> Adds the support for AMD 3rd generation processors. The model
> display for the new processor will be EPYC-Milan.
> 
> Adds the following new feature bits on top of the feature bits from
> the first and second generation EPYC models.
> 
> pcid  : Process context identifiers support
> ibrs  : Indirect Branch Restricted Speculation
> ssbd  : Speculative Store Bypass Disable
> erms  : Enhanced REP MOVSB/STOSB support
> fsrm  : Fast Short REP MOVSB support
> invpcid   : Invalidate processor context ID
> pku   : Protection keys support
> svme-addr-chk : SVM instructions address check for #GP handling
> 
> Depends on the following kernel commits:
> 14c2bf81fcd2 ("KVM: SVM: Fix #GP handling for doubly-nested virtualization")
> 3b9c723ed7cf ("KVM: SVM: Add support for SVM instruction address check 
> change")
> 4aa2691dcbd3 ("8ce1c461188799d863398dd2865d KVM: x86: Factor out x86 
> instruction emulation with decoding")
> 4407a797e941 ("KVM: SVM: Enable INVPCID feature on AMD")
> 9715092f8d7e ("KVM: X86: Move handling of INVPCID types to x86")
> 3f3393b3ce38 ("KVM: X86: Rename and move the function 
> vmx_handle_memory_failure to x86.c")
> 830bd71f2c06 ("KVM: SVM: Remove set_cr_intercept, clr_cr_intercept and 
> is_cr_intercept")
> 4c44e8d6c193 ("KVM: SVM: Add new intercept word in vmcb_control_area")
> c62e2e94b9d4 ("KVM: SVM: Modify 64 bit intercept field to two 32 bit vectors")
> 9780d51dc2af ("KVM: SVM: Modify intercept_exceptions to generic intercepts")
> 30abaa88382c ("KVM: SVM: Change intercept_dr to generic intercepts")
> 03bfeeb988a9 ("KVM: SVM: Change intercept_cr to generic intercepts")
> c45ad7229d13 ("KVM: SVM: Introduce 
> vmcb_(set_intercept/clr_intercept/_is_intercept)")
> a90c1ed9f11d ("(pcid) KVM: nSVM: Remove unused field")
> fa44b82eb831 ("KVM: x86: Move MPK feature detection to common code")
> 38f3e775e9c2 ("x86/Kconfig: Update config and kernel doc for MPK feature on 
> AMD")
> 37486135d3a7 ("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it 
> to x86.c")
> 
> Signed-off-by: Babu Moger 

Queued, thanks!

-- 
Eduardo




Re: [RFC v2 2/2] Basic CXL DOE for CDAT and Compliance Mode

2021-02-09 Thread Chris Browy



> On Feb 9, 2021, at 4:53 PM, Ben Widawsky  wrote:
> 
> A couple of high level comments below. Overall your approach was what I had
> imagined originally. The approach Jonathan took is likely more versatile (but
> harder to read, for sure).
> 
> I'm fine with either and I hope you two can come to an agreement on what the
> best way forward is.
> 
> My ultimate goal was to be able to take a CDAT from a real device and load it 
> as
> a blob into the ct3d for regression testing. Not sure if that's actually
> possible or not.

I’d think so.  

For the CDAT/DOE method, you could set up CDAT as non-ACPI tables but compile 
with ACPI iASL?  UEFI owns the ACPI and CDAT specs and all the info is public.

For example using generic datatypes one can describe CDAT structure types and
create an arbitrary CDAT table with any mix of struct types and describe one or 
more proximity
domains and their memory attributes.  The ct3d device can read the “blob” or 
.aml and setup 
entry indexing as Jonathan mentioned previously.  For example user could create 
a
CDAT table and compile using iasl -G  into a file.aml and disassemble 
back 
Into a file.dsl.

Here is example of CDAT header and DSMAS (with ACPI standard header as well):

Signature : "CDAT"
Table Length : 
Revision : 01
Checksum : 00
Oem ID : "TEST"
Oem Table ID : "QEMU "
Oem Revision : 0001
Asl Compiler ID : "INTL"
Asl Compiler Revision : 0001

Label : CDATST
Label : CDAT_HDR
UINT32 : $CDATEND - $CDATST
UINT8  : 01 // Revision 1
UINT8  : 00 // Checksum 1
UINT24 : 00 // Reserved 6
UINT32 :    // Sequence 4

Label : DSMAS   // FieldByte Length
UINT8  : 00 // Type 1
UINT8  : 00 // Reserved 1
UINT16 : 0018   // Length   2
UINT8  : 00 // DSMADHandle  1
UINT8  : 00 // Flags1
UINT16 :    // Reserved 2
UINT64 :    // DPA Base 8
UINT64 :    // DPA Length   8


For Device Option ROM method for CDAT, we could add a option rom to ct3d so 
UEFI could 
access CDAT through a  EFI_ADAPTER_INFORMATION_PROTOCOL (CDAT type) entry.


> 
> Thanks.
> Ben
> 
> On 21-02-09 15:36:03, Chris Browy wrote:
>> ---
>> hw/cxl/cxl-component-utils.c   | 132 +++
>> hw/mem/cxl_type3.c | 172 
>> include/hw/cxl/cxl_cdat.h  | 120 +
>> include/hw/cxl/cxl_compl.h | 289 
>> +
>> include/hw/cxl/cxl_component.h | 126 ++
>> include/hw/cxl/cxl_device.h|   3 +
>> include/hw/cxl/cxl_pci.h   |   4 +
>> 7 files changed, 846 insertions(+)
>> create mode 100644 include/hw/cxl/cxl_cdat.h
>> create mode 100644 include/hw/cxl/cxl_compl.h
>> 
>> diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
>> index e1bcee5..fc6c538 100644
>> --- a/hw/cxl/cxl-component-utils.c
>> +++ b/hw/cxl/cxl-component-utils.c
>> @@ -195,3 +195,135 @@ void cxl_component_create_dvsec(CXLComponentState 
>> *cxl, uint16_t length,
>> range_init_nofail(>dvsecs[type], cxl->dvsec_offset, length);
>> cxl->dvsec_offset += length;
>> }
>> +
>> +/* Return the sum of bytes */
>> +static void cdat_ent_init(CDATStruct *cs, void *base, uint32_t len)
>> +{
>> +cs->base = base;
>> +cs->length = len;
>> +}
>> +
>> +void cxl_doe_cdat_init(CXLComponentState *cxl_cstate)
>> +{
>> +uint8_t sum = 0;
>> +uint32_t len = 0;
>> +int i, j;
>> +
>> +cxl_cstate->cdat_ent_len = 7;
>> +cxl_cstate->cdat_ent =
>> +g_malloc0(sizeof(CDATStruct) * cxl_cstate->cdat_ent_len);
>> +
>> +cdat_ent_init(_cstate->cdat_ent[0],
>> +  _cstate->cdat_header, 
>> sizeof(cxl_cstate->cdat_header));
>> +cdat_ent_init(_cstate->cdat_ent[1],
>> +  _cstate->dsmas, sizeof(cxl_cstate->dsmas));
>> +cdat_ent_init(_cstate->cdat_ent[2],
>> +  _cstate->dslbis, sizeof(cxl_cstate->dslbis));
>> +cdat_ent_init(_cstate->cdat_ent[3],
>> +  _cstate->dsmscis, sizeof(cxl_cstate->dsmscis));
>> +cdat_ent_init(_cstate->cdat_ent[4],
>> +  _cstate->dsis, sizeof(cxl_cstate->dsis));
>> +cdat_ent_init(_cstate->cdat_ent[5],
>> +  _cstate->dsemts, sizeof(cxl_cstate->dsemts));
>> +cdat_ent_init(_cstate->cdat_ent[6],
>> +  _cstate->sslbis, sizeof(cxl_cstate->sslbis));
>> +
>> +/* Set the DSMAS entry, ent = 1 */
>> +cxl_cstate->dsmas.header.type = CDAT_TYPE_DSMAS;
>> +cxl_cstate->dsmas.header.reserved = 0x0;
>> +cxl_cstate->dsmas.header.length = sizeof(cxl_cstate->dsmas);
>> +cxl_cstate->dsmas.DSMADhandle = 0x0;
>> +cxl_cstate->dsmas.flags = 0x0;
>> 

Re: [PATCH 0/2] Additional vIOMMU fixes related to UNMAP notifiers

2021-02-09 Thread Peter Xu
On Tue, Feb 09, 2021 at 10:32:31PM +0100, Eric Auger wrote:
> 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb
> support") fixed part of the UNMAP related regressions introduced by
> b68ba1ca5767 ("memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP
> IOMMUTLBNotificationType").
> 
> However the case of the spapr_iommu was not addressed. It should be
> identical to the others. Also 958ec334bca3 introduced a regression
> on the VIRTIO-IOMMU/VFIO integration.
> 
> spapr_iommu is not tested.

Reviewed-by: Peter Xu 

Thanks!

-- 
Peter Xu




Re: [RFC PATCH v2 1/2] Basic PCIe DOE support

2021-02-09 Thread Chris Browy
No consensus yet but I’d suggest that we’ll do the QEMU work and Jonathan 
focuses 
on the linux kernel and UEFI/edk2 and CXL SSWG efforts.  Seems like
a way to maximize resources and everyone’s contribution and expertise.  QEMU 
part
requires the least expertise which is why we’re best suited for it compared to 
other 
areas ;)

Review comments will be folded into next patch.

> On Feb 9, 2021, at 4:42 PM, Ben Widawsky  wrote:
> 
> Have you/Jonathan come to consensus about which implementation is going 
> forward?
> I'd rather not have to review two :D
> 
> On 21-02-09 15:35:49, Chris Browy wrote:
>> ---
>> MAINTAINERS   |   7 +
>> hw/pci/meson.build|   1 +
>> hw/pci/pcie.c |   2 +-
>> hw/pci/pcie_doe.c | 414 
>> ++
>> include/hw/pci/pci_ids.h  |   2 +
>> include/hw/pci/pcie.h |   1 +
>> include/hw/pci/pcie_doe.h | 166 
>> include/hw/pci/pcie_regs.h|   4 +
>> include/standard-headers/linux/pci_regs.h |   3 +-
>> 9 files changed, 598 insertions(+), 2 deletions(-)
>> create mode 100644 hw/pci/pcie_doe.c
>> create mode 100644 include/hw/pci/pcie_doe.h
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 981dc92..4fb865e 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1655,6 +1655,13 @@ F: docs/pci*
>> F: docs/specs/*pci*
>> F: default-configs/pci.mak
>> 
>> +PCIE DOE
>> +M: Huai-Cheng Kuo 
>> +M: Chris Browy 
>> +S: Supported
>> +F: include/hw/pci/pcie_doe.h
>> +F: hw/pci/pcie_doe.c
>> +
>> ACPI/SMBIOS
>> M: Michael S. Tsirkin 
>> M: Igor Mammedov 
>> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
>> index 5c4bbac..115e502 100644
>> --- a/hw/pci/meson.build
>> +++ b/hw/pci/meson.build
>> @@ -12,6 +12,7 @@ pci_ss.add(files(
>> # allow plugging PCIe devices into PCI buses, include them even if
>> # CONFIG_PCI_EXPRESS=n.
>> pci_ss.add(files('pcie.c', 'pcie_aer.c'))
>> +pci_ss.add(files('pcie_doe.c'))
> 
> It looks like this should be like the below line:
> softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: pci_doe.c))
> 
>> softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 
>> 'pcie_host.c'))
>> softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>> 
>> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
>> index 1ecf6f6..f7516c4 100644
>> --- a/hw/pci/pcie.c
>> +++ b/hw/pci/pcie.c
>> @@ -735,7 +735,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
>> 
>> hotplug_event_notify(dev);
>> 
>> -/* 
>> +/*
> 
> Please drop this.
> 
>>  * 6.7.3.2 Command Completed Events
>>  *
>>  * Software issues a command to a hot-plug capable Downstream Port by
>> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
>> new file mode 100644
>> index 000..df8e92e
>> --- /dev/null
>> +++ b/hw/pci/pcie_doe.c
>> @@ -0,0 +1,414 @@
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "qemu/range.h"
>> +#include "hw/pci/pci.h"
>> +#include "hw/pci/pcie.h"
>> +#include "hw/pci/pcie_doe.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +
>> +/*
>> + * DOE Default Protocols (Discovery, CMA)
>> + */
>> +/* Discovery Request Object */
>> +struct doe_discovery {
>> +DOEHeader header;
>> +uint8_t index;
>> +uint8_t reserved[3];
>> +} QEMU_PACKED;
>> +
>> +/* Discovery Response Object */
>> +struct doe_discovery_rsp {
>> +DOEHeader header;
>> +uint16_t vendor_id;
>> +uint8_t doe_type;
>> +uint8_t next_index;
>> +} QEMU_PACKED;
>> +
>> +/* Callback for Discovery */
>> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
>> +{
>> +PCIEDOE *doe = doe_cap->doe;
>> +struct doe_discovery *req = pcie_doe_get_req(doe_cap);
>> +uint8_t index = req->index;
>> +DOEProtocol *prot = NULL;
>> +
>> +/* Request length mismatch, discard */
>> +if (req->header.length < dwsizeof(struct doe_discovery)) {
> 
> Use DIV_ROUND_UP instead of rolling your own thing.
> 
>> +return DOE_DISCARD;
>> +}
>> +
>> +/* Point to the requested protocol */
>> +if (index < doe->protocol_num) {
>> +prot = >protocols[index];
>> +}
> 
> What happens on else, should that still return DOE_SUCCESS?
> 
>> +
>> +struct doe_discovery_rsp rsp = {
>> +.header = {
>> +.vendor_id = PCI_VENDOR_ID_PCI_SIG,
>> +.doe_type = PCI_SIG_DOE_DISCOVERY,
>> +.reserved = 0x0,
>> +.length = dwsizeof(struct doe_discovery_rsp),
>> +},
> 
> mixed declarations are not allowed.
> DIV_ROUND_UP
> 
>> +.vendor_id = (prot) ? prot->vendor_id : 0x,
>> +.doe_type = (prot) ? prot->doe_type : 0xFF,
>> +.next_index = (index + 1) < doe->protocol_num ?
>> +  (index + 1) : 0,
>> +};
> 
> I prefer:
> next_index = (index + 1) % doe->protocol_num
> 
>> +
>> +

Re: [PATCH v2 63/93] tcg/tci: Use ffi for calls

2021-02-09 Thread Stefan Weil

Am 09.02.21 um 22:15 schrieb Stefan Weil:



Thanks for solving this. The patch works for me.

BIOS boot time with qemu-system-i386 is about 41 s (with my code which 
lacks thread support and ffi it was 40 s).


With qemu-system-x86_64 it is twice as fast, so it looks like in my 
last report where I said that the new code had doubled the speed I 
compared different system emulations.



Update: with Richard's latest tci-next branch which includes the fixed 
code both qemu-system-x86_64 and qemu-system-i386 require about 20 s 
user time for the BIOS boot.


Stefan





Re: [RFC v2 2/2] Basic CXL DOE for CDAT and Compliance Mode

2021-02-09 Thread Ben Widawsky
A couple of high level comments below. Overall your approach was what I had
imagined originally. The approach Jonathan took is likely more versatile (but
harder to read, for sure).

I'm fine with either and I hope you two can come to an agreement on what the
best way forward is.

My ultimate goal was to be able to take a CDAT from a real device and load it as
a blob into the ct3d for regression testing. Not sure if that's actually
possible or not.

Thanks.
Ben

On 21-02-09 15:36:03, Chris Browy wrote:
> ---
>  hw/cxl/cxl-component-utils.c   | 132 +++
>  hw/mem/cxl_type3.c | 172 
>  include/hw/cxl/cxl_cdat.h  | 120 +
>  include/hw/cxl/cxl_compl.h | 289 
> +
>  include/hw/cxl/cxl_component.h | 126 ++
>  include/hw/cxl/cxl_device.h|   3 +
>  include/hw/cxl/cxl_pci.h   |   4 +
>  7 files changed, 846 insertions(+)
>  create mode 100644 include/hw/cxl/cxl_cdat.h
>  create mode 100644 include/hw/cxl/cxl_compl.h
> 
> diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
> index e1bcee5..fc6c538 100644
> --- a/hw/cxl/cxl-component-utils.c
> +++ b/hw/cxl/cxl-component-utils.c
> @@ -195,3 +195,135 @@ void cxl_component_create_dvsec(CXLComponentState *cxl, 
> uint16_t length,
>  range_init_nofail(>dvsecs[type], cxl->dvsec_offset, length);
>  cxl->dvsec_offset += length;
>  }
> +
> +/* Return the sum of bytes */
> +static void cdat_ent_init(CDATStruct *cs, void *base, uint32_t len)
> +{
> +cs->base = base;
> +cs->length = len;
> +}
> +
> +void cxl_doe_cdat_init(CXLComponentState *cxl_cstate)
> +{
> +uint8_t sum = 0;
> +uint32_t len = 0;
> +int i, j;
> +
> +cxl_cstate->cdat_ent_len = 7;
> +cxl_cstate->cdat_ent =
> +g_malloc0(sizeof(CDATStruct) * cxl_cstate->cdat_ent_len);
> +
> +cdat_ent_init(_cstate->cdat_ent[0],
> +  _cstate->cdat_header, sizeof(cxl_cstate->cdat_header));
> +cdat_ent_init(_cstate->cdat_ent[1],
> +  _cstate->dsmas, sizeof(cxl_cstate->dsmas));
> +cdat_ent_init(_cstate->cdat_ent[2],
> +  _cstate->dslbis, sizeof(cxl_cstate->dslbis));
> +cdat_ent_init(_cstate->cdat_ent[3],
> +  _cstate->dsmscis, sizeof(cxl_cstate->dsmscis));
> +cdat_ent_init(_cstate->cdat_ent[4],
> +  _cstate->dsis, sizeof(cxl_cstate->dsis));
> +cdat_ent_init(_cstate->cdat_ent[5],
> +  _cstate->dsemts, sizeof(cxl_cstate->dsemts));
> +cdat_ent_init(_cstate->cdat_ent[6],
> +  _cstate->sslbis, sizeof(cxl_cstate->sslbis));
> +
> +/* Set the DSMAS entry, ent = 1 */
> +cxl_cstate->dsmas.header.type = CDAT_TYPE_DSMAS;
> +cxl_cstate->dsmas.header.reserved = 0x0;
> +cxl_cstate->dsmas.header.length = sizeof(cxl_cstate->dsmas);
> +cxl_cstate->dsmas.DSMADhandle = 0x0;
> +cxl_cstate->dsmas.flags = 0x0;
> +cxl_cstate->dsmas.reserved2 = 0x0;
> +cxl_cstate->dsmas.DPA_base = 0x0;
> +cxl_cstate->dsmas.DPA_length = 0x4;
> +
> +/* Set the DSLBIS entry, ent = 2 */
> +cxl_cstate->dslbis.header.type = CDAT_TYPE_DSLBIS;
> +cxl_cstate->dslbis.header.reserved = 0;
> +cxl_cstate->dslbis.header.length = sizeof(cxl_cstate->dslbis);
> +cxl_cstate->dslbis.handle = 0;
> +cxl_cstate->dslbis.flags = 0;
> +cxl_cstate->dslbis.data_type = 0;
> +cxl_cstate->dslbis.reserved2 = 0;
> +cxl_cstate->dslbis.entry_base_unit = 0;
> +cxl_cstate->dslbis.entry[0] = 0;
> +cxl_cstate->dslbis.entry[1] = 0;
> +cxl_cstate->dslbis.entry[2] = 0;
> +cxl_cstate->dslbis.reserved3 = 0;
> +
> +/* Set the DSMSCIS entry, ent = 3 */
> +cxl_cstate->dsmscis.header.type = CDAT_TYPE_DSMSCIS;
> +cxl_cstate->dsmscis.header.reserved = 0;
> +cxl_cstate->dsmscis.header.length = sizeof(cxl_cstate->dsmscis);
> +cxl_cstate->dsmscis.DSMASH_handle = 0;
> +cxl_cstate->dsmscis.reserved2[0] = 0;
> +cxl_cstate->dsmscis.reserved2[1] = 0;
> +cxl_cstate->dsmscis.reserved2[2] = 0;
> +cxl_cstate->dsmscis.memory_side_cache_size = 0;
> +cxl_cstate->dsmscis.cache_attributes = 0;
> +
> +/* Set the DSIS entry, ent = 4 */
> +cxl_cstate->dsis.header.type = CDAT_TYPE_DSIS;
> +cxl_cstate->dsis.header.reserved = 0;
> +cxl_cstate->dsis.header.length = sizeof(cxl_cstate->dsis);
> +cxl_cstate->dsis.flags = 0;
> +cxl_cstate->dsis.handle = 0;
> +cxl_cstate->dsis.reserved2 = 0;
> +
> +/* Set the DSEMTS entry, ent = 5 */
> +cxl_cstate->dsemts.header.type = CDAT_TYPE_DSEMTS;
> +cxl_cstate->dsemts.header.reserved = 0;
> +cxl_cstate->dsemts.header.length = sizeof(cxl_cstate->dsemts);
> +cxl_cstate->dsemts.DSMAS_handle = 0;
> +cxl_cstate->dsemts.EFI_memory_type_attr = 0;
> +cxl_cstate->dsemts.reserved2 = 0;
> +cxl_cstate->dsemts.DPA_offset = 0;
> +cxl_cstate->dsemts.DPA_length = 0;
> +
> +/* Set the SSLBIS 

Re: [RFC PATCH v2 1/2] Basic PCIe DOE support

2021-02-09 Thread Ben Widawsky
Have you/Jonathan come to consensus about which implementation is going forward?
I'd rather not have to review two :D

On 21-02-09 15:35:49, Chris Browy wrote:
> ---
>  MAINTAINERS   |   7 +
>  hw/pci/meson.build|   1 +
>  hw/pci/pcie.c |   2 +-
>  hw/pci/pcie_doe.c | 414 
> ++
>  include/hw/pci/pci_ids.h  |   2 +
>  include/hw/pci/pcie.h |   1 +
>  include/hw/pci/pcie_doe.h | 166 
>  include/hw/pci/pcie_regs.h|   4 +
>  include/standard-headers/linux/pci_regs.h |   3 +-
>  9 files changed, 598 insertions(+), 2 deletions(-)
>  create mode 100644 hw/pci/pcie_doe.c
>  create mode 100644 include/hw/pci/pcie_doe.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 981dc92..4fb865e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1655,6 +1655,13 @@ F: docs/pci*
>  F: docs/specs/*pci*
>  F: default-configs/pci.mak
>  
> +PCIE DOE
> +M: Huai-Cheng Kuo 
> +M: Chris Browy 
> +S: Supported
> +F: include/hw/pci/pcie_doe.h
> +F: hw/pci/pcie_doe.c
> +
>  ACPI/SMBIOS
>  M: Michael S. Tsirkin 
>  M: Igor Mammedov 
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac..115e502 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -12,6 +12,7 @@ pci_ss.add(files(
>  # allow plugging PCIe devices into PCI buses, include them even if
>  # CONFIG_PCI_EXPRESS=n.
>  pci_ss.add(files('pcie.c', 'pcie_aer.c'))
> +pci_ss.add(files('pcie_doe.c'))

It looks like this should be like the below line:
softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: pci_doe.c))

>  softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 
> 'pcie_host.c'))
>  softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
>  
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 1ecf6f6..f7516c4 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -735,7 +735,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
>  
>  hotplug_event_notify(dev);
>  
> -/* 
> +/*

Please drop this.

>   * 6.7.3.2 Command Completed Events
>   *
>   * Software issues a command to a hot-plug capable Downstream Port by
> diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
> new file mode 100644
> index 000..df8e92e
> --- /dev/null
> +++ b/hw/pci/pcie_doe.c
> @@ -0,0 +1,414 @@
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "qemu/range.h"
> +#include "hw/pci/pci.h"
> +#include "hw/pci/pcie.h"
> +#include "hw/pci/pcie_doe.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +
> +/*
> + * DOE Default Protocols (Discovery, CMA)
> + */
> +/* Discovery Request Object */
> +struct doe_discovery {
> +DOEHeader header;
> +uint8_t index;
> +uint8_t reserved[3];
> +} QEMU_PACKED;
> +
> +/* Discovery Response Object */
> +struct doe_discovery_rsp {
> +DOEHeader header;
> +uint16_t vendor_id;
> +uint8_t doe_type;
> +uint8_t next_index;
> +} QEMU_PACKED;
> +
> +/* Callback for Discovery */
> +static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
> +{
> +PCIEDOE *doe = doe_cap->doe;
> +struct doe_discovery *req = pcie_doe_get_req(doe_cap);
> +uint8_t index = req->index;
> +DOEProtocol *prot = NULL;
> +
> +/* Request length mismatch, discard */
> +if (req->header.length < dwsizeof(struct doe_discovery)) {

Use DIV_ROUND_UP instead of rolling your own thing.

> +return DOE_DISCARD;
> +}
> +
> +/* Point to the requested protocol */
> +if (index < doe->protocol_num) {
> +prot = >protocols[index];
> +}

What happens on else, should that still return DOE_SUCCESS?

> +
> +struct doe_discovery_rsp rsp = {
> +.header = {
> +.vendor_id = PCI_VENDOR_ID_PCI_SIG,
> +.doe_type = PCI_SIG_DOE_DISCOVERY,
> +.reserved = 0x0,
> +.length = dwsizeof(struct doe_discovery_rsp),
> +},

mixed declarations are not allowed.
DIV_ROUND_UP

> +.vendor_id = (prot) ? prot->vendor_id : 0x,
> +.doe_type = (prot) ? prot->doe_type : 0xFF,
> +.next_index = (index + 1) < doe->protocol_num ?
> +  (index + 1) : 0,
> +};

I prefer:
next_index = (index + 1) % doe->protocol_num

> +
> +pcie_doe_set_rsp(doe_cap, );
> +
> +return DOE_SUCCESS;
> +}
> +
> +/* Callback for CMA */
> +static bool pcie_doe_cma_rsp(DOECap *doe_cap)
> +{
> +doe_cap->status.error = 1;
> +
> +memset(doe_cap->read_mbox, 0,
> +   PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
> +
> +doe_cap->write_mbox_len = 0;
> +
> +return DOE_DISCARD;
> +}
> +
> +/*
> + * DOE Utilities
> + */
> +static void pcie_doe_reset_mbox(DOECap *st)
> +{
> +st->read_mbox_idx = 0;
> +
> +st->read_mbox_len = 0;
> +st->write_mbox_len = 0;
> +
> +memset(st->read_mbox, 0, 

[PATCH 2/2] spapr_iommu: Fix vhost integration regression

2021-02-09 Thread Eric Auger
Previous work on dev-iotlb message broke spapr_iommu/vhost integration
as it did for SMMU and virtio-iommu. The spapr_iommu currently
only sends IOMMU_NOTIFIER_UNMAP notifications. Since commit
958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support"),
VHOST first tries to register IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier
and if it fails, falls back to legacy IOMMU_NOTIFIER_UNMAP. So
spapr_iommu must fail on the IOMMU_NOTIFIER_DEVIOTLB_UNMAP
registration.

Reported-by: Peter Xu 
Fixes: b68ba1ca57677acf870d5ab10579e6105c1f5338
Signed-off-by: Eric Auger 
---
 hw/ppc/spapr_iommu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 30352df00e..24537ffcbd 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -212,6 +212,11 @@ static int spapr_tce_notify_flag_changed(IOMMUMemoryRegion 
*iommu,
 {
 struct SpaprTceTable *tbl = container_of(iommu, SpaprTceTable, iommu);
 
+if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
+error_setg(errp, "spart_tce does not support dev-iotlb yet");
+return -EINVAL;
+}
+
 if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) {
 spapr_tce_set_need_vfio(tbl, true);
 } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) {
-- 
2.26.2




[PATCH 0/2] Additional vIOMMU fixes related to UNMAP notifiers

2021-02-09 Thread Eric Auger
958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb
support") fixed part of the UNMAP related regressions introduced by
b68ba1ca5767 ("memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP
IOMMUTLBNotificationType").

However the case of the spapr_iommu was not addressed. It should be
identical to the others. Also 958ec334bca3 introduced a regression
on the VIRTIO-IOMMU/VFIO integration.

spapr_iommu is not tested.

Best Regards

Eric

Eric Auger (2):
  vfio: Do not register any IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier
  spapr_iommu: Fix vhost integration regression

 hw/ppc/spapr_iommu.c | 5 +
 hw/vfio/common.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

-- 
2.26.2




[PATCH 1/2] vfio: Do not register any IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier

2021-02-09 Thread Eric Auger
In an attempt to fix smmu/virtio-iommu - vhost regression, commit
958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support")
broke virtio-iommu integration. This is due to the fact VFIO registers
IOMMU_NOTIFIER_ALL notifiers, which includes IOMMU_NOTIFIER_DEVIOTLB_UNMAP
and this latter now is rejected by the virtio-iommu. As a consequence,
the registration fails. VHOST behaves like a device with an ATC cache. The
VFIO device does not support this scheme yet.

Let's register only legacy MAP and UNMAP notifiers.

Fixes: 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb 
support")
Signed-off-by: Eric Auger 
---
 hw/vfio/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6ff1daa763..a50b10c801 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -787,7 +787,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
MEMTXATTRS_UNSPECIFIED);
 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
-IOMMU_NOTIFIER_ALL,
+IOMMU_NOTIFIER_IOTLB_EVENTS,
 section->offset_within_region,
 int128_get64(llend),
 iommu_idx);
-- 
2.26.2




[PATCH v2] i386: Add the support for AMD EPYC 3rd generation processors

2021-02-09 Thread Babu Moger
Adds the support for AMD 3rd generation processors. The model
display for the new processor will be EPYC-Milan.

Adds the following new feature bits on top of the feature bits from
the first and second generation EPYC models.

pcid  : Process context identifiers support
ibrs  : Indirect Branch Restricted Speculation
ssbd  : Speculative Store Bypass Disable
erms  : Enhanced REP MOVSB/STOSB support
fsrm  : Fast Short REP MOVSB support
invpcid   : Invalidate processor context ID
pku   : Protection keys support
svme-addr-chk : SVM instructions address check for #GP handling

Depends on the following kernel commits:
14c2bf81fcd2 ("KVM: SVM: Fix #GP handling for doubly-nested virtualization")
3b9c723ed7cf ("KVM: SVM: Add support for SVM instruction address check change")
4aa2691dcbd3 ("KVM: x86: Factor out x86 instruction emulation with decoding")
4407a797e941 ("KVM: SVM: Enable INVPCID feature on AMD")
9715092f8d7e ("KVM: X86: Move handling of INVPCID types to x86")
3f3393b3ce38 ("KVM: X86: Rename and move the function vmx_handle_memory_failure 
to x86.c")
830bd71f2c06 ("KVM: SVM: Remove set_cr_intercept, clr_cr_intercept and 
is_cr_intercept")
4c44e8d6c193 ("KVM: SVM: Add new intercept word in vmcb_control_area")
c62e2e94b9d4 ("KVM: SVM: Modify 64 bit intercept field to two 32 bit vectors")
9780d51dc2af ("KVM: SVM: Modify intercept_exceptions to generic intercepts")
30abaa88382c ("KVM: SVM: Change intercept_dr to generic intercepts")
03bfeeb988a9 ("KVM: SVM: Change intercept_cr to generic intercepts")
c45ad7229d13 ("KVM: SVM: Introduce 
vmcb_(set_intercept/clr_intercept/_is_intercept)")
a90c1ed9f11d ("(pcid) KVM: nSVM: Remove unused field")
fa44b82eb831 ("KVM: x86: Move MPK feature detection to common code")
38f3e775e9c2 ("x86/Kconfig: Update config and kernel doc for MPK feature on 
AMD")
37486135d3a7 ("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to 
x86.c")

Signed-off-by: Babu Moger 
---
v2: Added svme-addr-chk. Also added all the dependent kernel commits in the log.

v1: 
https://lore.kernel.org/qemu-devel/16118780.27536.17735339269843944966.stgit@bmoger-ubuntu/

 target/i386/cpu.c |  107 +
 target/i386/cpu.h |4 ++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 9c3d2d60b7..24db7ed892 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1033,7 +1033,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = 
{
 "clzero", NULL, "xsaveerptr", NULL,
 NULL, NULL, NULL, NULL,
 NULL, "wbnoinvd", NULL, NULL,
-"ibpb", NULL, NULL, "amd-stibp",
+"ibpb", NULL, "ibrs", "amd-stibp",
 NULL, NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
 "amd-ssbd", "virt-ssbd", "amd-no-ssb", NULL,
@@ -1798,6 +1798,56 @@ static CPUCaches epyc_rome_cache_info = {
 },
 };
 
+static CPUCaches epyc_milan_cache_info = {
+.l1d_cache = &(CPUCacheInfo) {
+.type = DATA_CACHE,
+.level = 1,
+.size = 32 * KiB,
+.line_size = 64,
+.associativity = 8,
+.partitions = 1,
+.sets = 64,
+.lines_per_tag = 1,
+.self_init = 1,
+.no_invd_sharing = true,
+},
+.l1i_cache = &(CPUCacheInfo) {
+.type = INSTRUCTION_CACHE,
+.level = 1,
+.size = 32 * KiB,
+.line_size = 64,
+.associativity = 8,
+.partitions = 1,
+.sets = 64,
+.lines_per_tag = 1,
+.self_init = 1,
+.no_invd_sharing = true,
+},
+.l2_cache = &(CPUCacheInfo) {
+.type = UNIFIED_CACHE,
+.level = 2,
+.size = 512 * KiB,
+.line_size = 64,
+.associativity = 8,
+.partitions = 1,
+.sets = 1024,
+.lines_per_tag = 1,
+},
+.l3_cache = &(CPUCacheInfo) {
+.type = UNIFIED_CACHE,
+.level = 3,
+.size = 32 * MiB,
+.line_size = 64,
+.associativity = 16,
+.partitions = 1,
+.sets = 32768,
+.lines_per_tag = 1,
+.self_init = true,
+.inclusive = true,
+.complex_indexing = true,
+},
+};
+
 /* The following VMX features are not supported by KVM and are left out in the
  * CPU definitions:
  *
@@ -4130,6 +4180,61 @@ static X86CPUDefinition builtin_x86_defs[] = {
 .model_id = "AMD EPYC-Rome Processor",
 .cache_info = &epyc_rome_cache_info,
 },
+{
+.name = "EPYC-Milan",
+.level = 0xd,
+.vendor = CPUID_VENDOR_AMD,
+.family = 25,
+.model = 1,
+.stepping = 1,
+.features[FEAT_1_EDX] =
+CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH |
+CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE |
+CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | 

Re: [PATCH v2 63/93] tcg/tci: Use ffi for calls

2021-02-09 Thread Stefan Weil

Am 09.02.21 um 21:46 schrieb Richard Henderson:


On 2/8/21 2:55 PM, Richard Henderson wrote:

Ok, I've reproduced something on a T3 (gcc102.fsffrance.org).
Running the same code side-by-side vs the T5, I get different results.

Brown paper bag time: the T5 build dir lost the --enable-tcg-interpreter flag,
so was testing tcg native.

Big-endian bug wrt an odd api wart in libffi.  Fixed thus.



Thanks for solving this. The patch works for me.

BIOS boot time with qemu-system-i386 is about 41 s (with my code which 
lacks thread support and ffi it was 40 s).


With qemu-system-x86_64 it is twice as fast, so it looks like in my last 
report where I said that the new code had doubled the speed I compared 
different system emulations.


Apropos "slow" TCI: on Apple's M1 it is faster than native TCG on most 
of my Intel / AMD machines. And it works with current git master while 
native TCG still waits for pending patches which fix the memory access.


Stefan






[Bug 1625216] Re: memory writes via gdb don't work for memory mapped hardware

2021-02-09 Thread Alex Bennée
** Tags added: gdbstub

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1625216

Title:
  memory writes via gdb don't work for memory mapped hardware

Status in QEMU:
  Confirmed

Bug description:
  When I remote-debug a qemu-guest and attempt to write to a memory mapped 
location, the
  write-handler for the concerned device will not be called. All 
write-requiests from
  gdb are delegated to cpu_physical_memory_write_rom(...). a function that 
writes to the 
  underlying ram-block.

  I believe requests to memory mapped hardware should be delegated to 
  address_space_rw(). 

  example:
  ;; a memory mapped device. No effect, the write-handler is not called
  (gdb) set *0xfff3c000 = 48

  ;; a ram or rom-block. Thos works. The value is changed.
  (gdb) set *0x10 = 48

  
  

  Here's my suggested patch. As noted in the comment, it could perhaps be
  improved for the (rare) case when the write-request from gdb spans multiple 
  memory regions.

  $ git diff   85bc2a15121e8bcd9f15eb75794a1eacca9d84bd HEAD ../exec.c
  diff --git a/exec.c b/exec.c
  index c4f9036..45ef896 100644
  --- a/exec.c
  +++ b/exec.c
  @@ -3676,6 +3676,7 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong 
addr,
   int l;
   hwaddr phys_addr;
   target_ulong page;
  +bool is_memcpy_access;
   
   while (len > 0) {
   int asidx;
  @@ -3691,13 +3692,32 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong 
addr,
   if (l > len)
   l = len;
   phys_addr += (addr & ~TARGET_PAGE_MASK);
  +
   if (is_write) {
  +/* if ram/rom region we access the memory 
  +   via memcpy instead of via the cpu */
  +hwaddr mr_len, addr1;
  +AddressSpace *as = cpu->cpu_ases[asidx].as;
   +MemoryRegion *mr = address_space_translate(as, phys_addr, &addr1,
&mr_len, is_write);
  +is_memcpy_access  = memory_region_is_ram(mr) || 
memory_region_is_romd(mr);
  +if(mr_len < len) {
  +/* TODO, mimic more of the loop over mr chunks as 
  +   done in cpu_physical_memory_write_internal */ 
  +printf("warning: we dont know whether all bytes "
  +   "to be written are ram/rom or io\n");
  +}
  +}
  +else {
  +is_memcpy_access = false;
  +}
  +
  +if (is_write && is_memcpy_access) {
   cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
 phys_addr, buf, l);
   } else {
   address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
MEMTXATTRS_UNSPECIFIED,
  - buf, l, 0);
  + buf, l, is_write);
   }
   len -= l;
   buf += l;

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1625216/+subscriptions



Re: [PATCH v1 11/12] accel/tcg: allow plugin instrumentation to be disable via cflags

2021-02-09 Thread Richard Henderson
On 2/9/21 10:27 AM, Alex Bennée wrote:
> When icount is enabled and we recompile an MMIO access we end up
> double counting the instruction execution. To avoid this we introduce
> the CF_NOINSTR cflag which disables instrumentation for the next TB.
> As this is part of the hashed compile flags we will only execute the
> generated TB while coming out of a cpu_io_recompile.
> 
> While we are at it delete the old TODO. We might as well keep the
> translation handy as it's likely you will repeatedly hit it on each
> MMIO access.
> 
> Reported-by: Aaron Lindsay 
> Signed-off-by: Alex Bennée 
> ---
>  include/exec/exec-all.h   |  3 ++-
>  accel/tcg/translate-all.c | 17 -
>  accel/tcg/translator.c|  2 +-
>  3 files changed, 11 insertions(+), 11 deletions(-)

Reviewed-by: Richard Henderson 

r~



Re: [PATCH v1 10/12] accel/tcg: remove CF_NOCACHE and special cases

2021-02-09 Thread Richard Henderson
On 2/9/21 10:27 AM, Alex Bennée wrote:
> Now we no longer generate CF_NOCACHE blocks we can remove a bunch of
> the special case handling for them. While we are at it we can remove
> the unused tb->orig_tb field and save a few bytes on the TB structure.
> 
> Signed-off-by: Alex Bennée 
> ---
>  include/exec/exec-all.h   |  3 ---
>  accel/tcg/translate-all.c | 51 ---
>  2 files changed, 15 insertions(+), 39 deletions(-)

Reviewed-by: Richard Henderson 

r~



Re: [PATCH v1 07/12] accel/tcg: actually cache our partial icount TB

2021-02-09 Thread Alex Bennée


Richard Henderson  writes:

> On 2/9/21 10:27 AM, Alex Bennée wrote:
>>  /* Refill decrementer and continue execution.  */
>> -insns_left = MIN(0xffff, cpu->icount_budget);
>> +insns_left = MIN(CF_COUNT_MASK, cpu->icount_budget);
> ...
>> +g_assert(insns_left < CF_COUNT_MASK);
>
> Why both the MIN and the assert?

Lack of faith in MIN I guess ;-)

I'll drop the assert.

>
>
> r~


-- 
Alex Bennée



Re: [PATCH 0/5] Drop float32/float64 accessors used by gdbstub code

2021-02-09 Thread Alex Bennée


Peter Maydell  writes:

> We used to make a distinction between 'float64'/'float32' types and
> the 'uint64_t'/'uint32_t' types, requiring special conversion
> operations to go between them.  We've now dropped this distinction as
> unnecessary, and the 'float*' types remain primarily for
> documentation purposes when used in places like the function
> prototypes of TCG helper functions.
>
> This means that there's no need for special gdb_get_float64() and
> gdb_get_float32() functions to write float64 or float32 values to the
> GDB protocol buffer; we can just use gdb_get_reg64() and
> gdb_get_reg32().
>
> Similarly, for reading a value out of the GDB buffer into a float64
> or float32 we can use ldq_p() or ldl_p() and need not use ldfq_p()
> or ldfl_p().
>
> This patchseries drops the use of the gdb_get_float* and ldf*
> functions from the three targets that were using them, and then
> removes the now-unused functions from gdbstub.h and bswap.h.

Queued to gdbstub/next, thanks.

-- 
Alex Bennée



Re: [PATCH v1 05/12] tests/plugin: expand insn test to detect duplicate instructions

2021-02-09 Thread Alex Bennée


Richard Henderson  writes:

> On 2/9/21 10:27 AM, Alex Bennée wrote:
>> A duplicate insn is one that is appears to be executed twice in a row.
>> This is currently possible due to -icount and cpu_io_recompile()
>> causing a re-translation of a block. On its own this won't trigger
>> any tests though.
>> 
>> Signed-off-by: Alex Bennée 
>> 
>> ---
>> [AJB: well not quite, the x86_64 test trips over this due to some
>> weirdness in the way we handle rep insns, e.g. rep movsb (%esi),
>> %es:(%edi) in the x86 bios code]
>
> Ah, but that's not tcg weirdness, that's architectural weirdness.  Multiple
> executions is how "rep" is supposed to work.

As the plugin can know the arch I can just disable the test for x86. At
the moment it doesn't matter because there is only a test for aarch64.

-- 
Alex Bennée



Re: [PATCH v2 63/93] tcg/tci: Use ffi for calls

2021-02-09 Thread Richard Henderson
On 2/8/21 2:55 PM, Richard Henderson wrote:
> Ok, I've reproduced something on a T3 (gcc102.fsffrance.org).
> Running the same code side-by-side vs the T5, I get different results.

Brown paper bag time: the T5 build dir lost the --enable-tcg-interpreter flag,
so was testing tcg native.

Big-endian bug wrt an odd api wart in libffi.  Fixed thus.


r~
diff --git a/tcg/tci.c b/tcg/tci.c
index d27db9f720..dd0cca296a 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -557,8 +557,15 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 case 0: /* void */
 break;
 case 1: /* uint32_t */
-regs[TCG_REG_R0] = *(uint32_t *)stack;
-break;
+/*
+ * Note that libffi has an odd special case in that it will
+ * always widen an integral result to ffi_arg.
+ */
+if (sizeof(ffi_arg) == 4) {
+regs[TCG_REG_R0] = *(uint32_t *)stack;
+break;
+}
+/* fall through */
 case 2: /* uint64_t */
 if (TCG_TARGET_REG_BITS == 32) {
 tci_write_reg64(regs, TCG_REG_R1, TCG_REG_R0, stack[0]);


[RFC PATCH v2 1/2] Basic PCIe DOE support

2021-02-09 Thread Chris Browy
---
 MAINTAINERS   |   7 +
 hw/pci/meson.build|   1 +
 hw/pci/pcie.c |   2 +-
 hw/pci/pcie_doe.c | 414 ++
 include/hw/pci/pci_ids.h  |   2 +
 include/hw/pci/pcie.h |   1 +
 include/hw/pci/pcie_doe.h | 166 
 include/hw/pci/pcie_regs.h|   4 +
 include/standard-headers/linux/pci_regs.h |   3 +-
 9 files changed, 598 insertions(+), 2 deletions(-)
 create mode 100644 hw/pci/pcie_doe.c
 create mode 100644 include/hw/pci/pcie_doe.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 981dc92..4fb865e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1655,6 +1655,13 @@ F: docs/pci*
 F: docs/specs/*pci*
 F: default-configs/pci.mak
 
+PCIE DOE
+M: Huai-Cheng Kuo 
+M: Chris Browy 
+S: Supported
+F: include/hw/pci/pcie_doe.h
+F: hw/pci/pcie_doe.c
+
 ACPI/SMBIOS
 M: Michael S. Tsirkin 
 M: Igor Mammedov 
diff --git a/hw/pci/meson.build b/hw/pci/meson.build
index 5c4bbac..115e502 100644
--- a/hw/pci/meson.build
+++ b/hw/pci/meson.build
@@ -12,6 +12,7 @@ pci_ss.add(files(
 # allow plugging PCIe devices into PCI buses, include them even if
 # CONFIG_PCI_EXPRESS=n.
 pci_ss.add(files('pcie.c', 'pcie_aer.c'))
+pci_ss.add(files('pcie_doe.c'))
 softmmu_ss.add(when: 'CONFIG_PCI_EXPRESS', if_true: files('pcie_port.c', 
'pcie_host.c'))
 softmmu_ss.add_all(when: 'CONFIG_PCI', if_true: pci_ss)
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 1ecf6f6..f7516c4 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -735,7 +735,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
 
 hotplug_event_notify(dev);
 
-/* 
+/*
  * 6.7.3.2 Command Completed Events
  *
  * Software issues a command to a hot-plug capable Downstream Port by
diff --git a/hw/pci/pcie_doe.c b/hw/pci/pcie_doe.c
new file mode 100644
index 000..df8e92e
--- /dev/null
+++ b/hw/pci/pcie_doe.c
@@ -0,0 +1,414 @@
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/range.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_doe.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+
+/*
+ * DOE Default Protocols (Discovery, CMA)
+ */
+/* Discovery Request Object */
+struct doe_discovery {
+DOEHeader header;
+uint8_t index;
+uint8_t reserved[3];
+} QEMU_PACKED;
+
+/* Discovery Response Object */
+struct doe_discovery_rsp {
+DOEHeader header;
+uint16_t vendor_id;
+uint8_t doe_type;
+uint8_t next_index;
+} QEMU_PACKED;
+
+/* Callback for Discovery */
+static bool pcie_doe_discovery_rsp(DOECap *doe_cap)
+{
+PCIEDOE *doe = doe_cap->doe;
+struct doe_discovery *req = pcie_doe_get_req(doe_cap);
+uint8_t index = req->index;
+DOEProtocol *prot = NULL;
+
+/* Request length mismatch, discard */
+if (req->header.length < dwsizeof(struct doe_discovery)) {
+return DOE_DISCARD;
+}
+
+/* Point to the requested protocol */
+if (index < doe->protocol_num) {
+prot = &doe->protocols[index];
+}
+
+struct doe_discovery_rsp rsp = {
+.header = {
+.vendor_id = PCI_VENDOR_ID_PCI_SIG,
+.doe_type = PCI_SIG_DOE_DISCOVERY,
+.reserved = 0x0,
+.length = dwsizeof(struct doe_discovery_rsp),
+},
+.vendor_id = (prot) ? prot->vendor_id : 0x,
+.doe_type = (prot) ? prot->doe_type : 0xFF,
+.next_index = (index + 1) < doe->protocol_num ?
+  (index + 1) : 0,
+};
+
+pcie_doe_set_rsp(doe_cap, &rsp);
+
+return DOE_SUCCESS;
+}
+
+/* Callback for CMA */
+static bool pcie_doe_cma_rsp(DOECap *doe_cap)
+{
+doe_cap->status.error = 1;
+
+memset(doe_cap->read_mbox, 0,
+   PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+
+doe_cap->write_mbox_len = 0;
+
+return DOE_DISCARD;
+}
+
+/*
+ * DOE Utilities
+ */
+static void pcie_doe_reset_mbox(DOECap *st)
+{
+st->read_mbox_idx = 0;
+
+st->read_mbox_len = 0;
+st->write_mbox_len = 0;
+
+memset(st->read_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+memset(st->write_mbox, 0, PCI_DOE_MAX_DW_SIZE * sizeof(uint32_t));
+}
+
+/*
+ * Initialize the list and protocol for a device.
+ * This function won't add the DOE capability to your PCIe device.
+ */
+void pcie_doe_init(PCIDevice *dev, PCIEDOE *doe)
+{
+doe->pdev = dev;
+doe->head = NULL;
+doe->protocol_num = 0;
+
+/* Register two default protocol */
+//TODO : LINK LIST
+pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
+PCI_SIG_DOE_DISCOVERY, pcie_doe_discovery_rsp);
+pcie_doe_register_protocol(doe, PCI_VENDOR_ID_PCI_SIG,
+PCI_SIG_DOE_CMA, pcie_doe_cma_rsp);
+}
+
+int pcie_cap_doe_add(PCIEDOE *doe, uint16_t offset, bool intr, uint16_t vec) {
+DOECap *new_cap, **ptr;
+PCIDevice *dev = doe->pdev;
+
+pcie_add_capability(dev, 

  1   2   3   4   5   >