On Thu, 2022-08-11 at 12:11 +0000, Janosch Frank wrote: > Sometimes dumping a guest from the outside is the only way to get the > data that is needed. This can be the case if a dumping mechanism like > KDUMP hasn't been configured or data needs to be fetched at a specific > point. Dumping a protected guest from the outside without help from > fw/hw doesn't yield sufficient data to be useful. Hence we now > introduce PV dump support. > > The PV dump support works by integrating the firmware into the dump > process. New Ultravisor calls are used to initiate the dump process, > dump cpu data, dump memory state and lastly complete the dump process. > The UV calls are exposed by KVM via the new KVM_PV_DUMP command and > its subcommands. The guest's data is fully encrypted and can only be > decrypted by the entity that owns the customer communication key for > the dumped guest. Also dumping needs to be allowed via a flag in the > SE header. > > On the QEMU side of things we store the PV dump data in the newly > introduced architecture ELF sections (storage state and completion > data) and the cpu notes (for cpu dump data). > > Users can use the zgetdump tool to convert the encrypted QEMU dump to an > unencrypted one.
Does PV dump work when memory is being filtered? Are there any constraints on the filter parameters, alignment or so? > > Signed-off-by: Janosch Frank <fran...@linux.ibm.com> > --- > dump/dump.c | 12 +- > include/sysemu/dump.h | 5 + > target/s390x/arch_dump.c | 242 ++++++++++++++++++++++++++++++++++----- > 3 files changed, 227 insertions(+), 32 deletions(-) [...] > > typedef struct NoteFuncDescStruct { > int contents_size; > + uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents > */ > void (*note_contents_func)(Note *note, S390CPU *cpu, int id); > + bool pvonly; > } NoteFuncDesc; > > static const NoteFuncDesc note_core[] = { > - {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus}, > - {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset}, > - { 0, NULL} > + {sizeof_field(Note, contents.prstatus), NULL, > s390x_write_elf64_prstatus, false}, > + {sizeof_field(Note, contents.fpregset), NULL, > s390x_write_elf64_fpregset, false}, > + { 0, NULL, NULL} > }; > > static const NoteFuncDesc note_linux[] = { > - {sizeof_field(Note, contents.prefix), s390x_write_elf64_prefix}, > - {sizeof_field(Note, contents.ctrs), s390x_write_elf64_ctrs}, > - {sizeof_field(Note, contents.timer), s390x_write_elf64_timer}, > - {sizeof_field(Note, contents.todcmp), s390x_write_elf64_todcmp}, > - {sizeof_field(Note, contents.todpreg), s390x_write_elf64_todpreg}, > - {sizeof_field(Note, contents.vregslo), s390x_write_elf64_vregslo}, > - {sizeof_field(Note, contents.vregshi), s390x_write_elf64_vregshi}, > - {sizeof_field(Note, contents.gscb), s390x_write_elf64_gscb}, > - { 0, NULL} > + {sizeof_field(Note, contents.prefix), NULL, s390x_write_elf64_prefix, > false}, > + {sizeof_field(Note, contents.ctrs), NULL, s390x_write_elf64_ctrs, > false}, > + {sizeof_field(Note, contents.timer), NULL, s390x_write_elf64_timer, > false}, > + {sizeof_field(Note, contents.todcmp), NULL, s390x_write_elf64_todcmp, > false}, > + {sizeof_field(Note, contents.todpreg), NULL, s390x_write_elf64_todpreg, > false}, > + {sizeof_field(Note, contents.vregslo), NULL, s390x_write_elf64_vregslo, > false}, > + {sizeof_field(Note, contents.vregshi), NULL, s390x_write_elf64_vregshi, > false}, > + {sizeof_field(Note, contents.gscb), NULL, s390x_write_elf64_gscb, > false}, > + {0, kvm_s390_pv_dmp_get_size_cpu, s390x_write_elf64_pv, true}, > + { 0, NULL, NULL} > }; > > static int s390x_write_elf64_notes(const char *note_name, > @@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char > *note_name, > DumpState *s, > const NoteFuncDesc *funcs) > { > - Note note; > + Note note, *notep; > const NoteFuncDesc *nf; > - int note_size; > + int note_size, content_size; Could make those size_t. I guess it's not necessary, but we're kind of a dumb pipe for data from the ultravisor so there's something to be said for not making assumptions. > int ret = -1; > > assert(strlen(note_name) < sizeof(note.name)); > > for (nf = funcs; nf->note_contents_func; nf++) { > - memset(¬e, 0, sizeof(note)); > - note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); > - note.hdr.n_descsz = cpu_to_be32(nf->contents_size); > - g_strlcpy(note.name, note_name, sizeof(note.name)); > - (*nf->note_contents_func)(¬e, cpu, id); > + notep = ¬e; > + if (nf->pvonly && !s390_is_pv()) { > + continue; > + } > > - note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size; > - ret = f(¬e, note_size, s); > + content_size = nf->contents_size ? nf->contents_size : > nf->note_size_func(); Your comment above says /* NULL for non-dynamic sized contents */ So it makes sense to condition on that, i.e. + content_size = nf->contents_size_func ? nf->note_size_func() : nf->contents_size; And it makes it consistent with the ifs below > + note_size = sizeof(note) - sizeof(notep->contents) + content_size; > + > + /* Notes with dynamic sizes need to allocate a note */ > + if (nf->note_size_func) { > + notep = g_malloc0(note_size); > + } > + > + memset(notep, 0, sizeof(note)); > + > + /* Setup note header data */ > + notep->hdr.n_descsz = cpu_to_be32(content_size); > + notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); > + g_strlcpy(notep->name, note_name, sizeof(notep->name)); > + > + /* Get contents and write them out */ > + (*nf->note_contents_func)(notep, cpu, id); > + ret = f(notep, note_size, s); > + > + if (nf->note_size_func) { > + g_free(notep); > + } > > if (ret < 0) { > return -1; > @@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, > CPUState *cs, > return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux); > } > > +/* PV dump section size functions */ > +static uint64_t get_dump_stor_state_size_from_len(uint64_t len) > +{ > + return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state(); > +} > + > +static uint64_t get_size_stor_state(DumpState *s) > +{ > + return get_dump_stor_state_size_from_len(s->total_size); > +} > + > +static uint64_t get_size_complete(DumpState *s) > +{ > + return kvm_s390_pv_dmp_get_size_complete(); > +} > + > +/* PV dump section data functions*/ > +static int get_data_complete(DumpState *s, uint8_t *buff) > +{ > + int rc; > + > + if (!pv_dump_initialized) { > + return 0; > + } > + rc = kvm_s390_dump_complete(buff); > + if (!rc) { > + pv_dump_initialized = false; > + } > + return rc; > +} > + > +static int dump_mem(DumpState *s, uint64_t gaddr, uint8_t *buff, uint64_t > buff_len) The DumpState arg is being ignored. > +{ > + /* We need the gaddr + len and something to write to */ > + if (!pv_dump_initialized) { > + return 0; > + } > + return kvm_s390_dump_mem(gaddr, buff_len, buff); > +} > + > +static int get_data_mem(DumpState *s, uint8_t *buff) > +{ > + int64_t memblock_size, memblock_start; > + GuestPhysBlock *block; > + uint64_t off; > + > + QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) { > + memblock_start = dump_filtered_memblock_start(block, > s->filter_area_begin, > + s->filter_area_length); > + if (memblock_start == -1) { > + continue; > + } > + > + memblock_size = dump_filtered_memblock_size(block, > s->filter_area_begin, > + s->filter_area_length); > + > + off = get_dump_stor_state_size_from_len(block->target_start); > + dump_mem(s, block->target_start, buff + off, > + get_dump_stor_state_size_from_len(memblock_size)); This ignores the return value, if this is intentional a comment explaining it would be nice. > + } > + > + return 0; > +} > + > +struct sections { > + uint64_t (*sections_size_func)(DumpState *s); > + int (*sections_contents_func)(DumpState *s, uint8_t *buff); > + char sctn_str[12]; > +} sections[] = { > + { get_size_stor_state, get_data_mem, "pv_mem_meta"}, > + { get_size_complete, get_data_complete, "pv_compl"}, > + {NULL , NULL, ""} > +}; Could be static right? > + > +static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff) > +{ > + Elf64_Shdr *shdr = (void *)buff; > + struct sections *sctn = sections; > + uint64_t off = s->section_offset; > + > + if (!s390_is_pv()) { > + return 0; > + } > + > + for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) { > + memset(shdr, 0, sizeof(*shdr)); This isn't necessary since the header mem is zero allocated, but you might not want to make guarantees about that. Maybe consider doing a normal malloc and memsetting just the special 0 index section header in dump.c. > + shdr->sh_type = SHT_PROGBITS; > + shdr->sh_offset = off; > + shdr->sh_size = sctn->sections_size_func(s); > + shdr->sh_name = s->string_table_buf->len; > + g_array_append_vals(s->string_table_buf, sctn->sctn_str, > sizeof(sctn->sctn_str)); > + } > + > + return (uintptr_t)shdr - (uintptr_t)buff; > +} > + > + > +/* Add arch specific number of sections and their respective sizes */ > +static void arch_sections_add(DumpState *s) > +{ > + struct sections *sctn = sections; > + > + /* > + * We only do a PV dump if we are running a PV guest, KVM supports > + * the dump API and we got valid dump length information. > + */ > + if (!s390_is_pv() || !kvm_s390_get_protected_dump() || > + !kvm_s390_pv_info_basic_valid()) { > + return; > + } > + > + /* > + * Start the UV dump process by doing the initialize dump call via > + * KVM as the proxy. > + */ > + if (!kvm_s390_dump_init()) { > + pv_dump_initialized = true; > + } > + > + for (; sctn->sections_size_func; sctn++) { > + s->shdr_num += 1; > + s->elf_section_data_size += sctn->sections_size_func(s); > + } > + > + /* We use the string table to identify the sections */ > + s->string_table_usage = true; In dump.c this value is being set if shdr_num > 0, so this seems redundant. > +} > + > +/* > + * After the PV dump has been initialized, the CPU data has been > + * fetched and memory has been dumped, we need to grab the tweak data > + * and the completion data. > + */ > +static void arch_sections_write(DumpState *s, uint8_t *buff) > +{ > + struct sections *sctn = sections; > + > + /* shdr_num should only have been set > 1 if we are protected */ > + assert(s390_is_pv()); > + > + for (; sctn->sections_size_func; sctn++) { > + sctn->sections_contents_func(s, buff); As above with regards to ignoring return values. > + buff += sctn->sections_size_func(s); > + } > +} > + > int cpu_get_dump_info(ArchDumpInfo *info, > const struct GuestPhysBlockList *guest_phys_blocks) > { > info->d_machine = EM_S390; > info->d_endian = ELFDATA2MSB; > info->d_class = ELFCLASS64; > + info->arch_sections_add_fn = *arch_sections_add; > + info->arch_sections_write_hdr_fn = *arch_sections_write_hdr; > + info->arch_sections_write_fn = *arch_sections_write; > > return 0; > } > @@ -261,7 +448,7 @@ ssize_t cpu_get_note_size(int class, int machine, int > nr_cpus) > { > int name_size = 8; /* "LINUX" or "CORE" + pad */ > size_t elf_note_size = 0; > - int note_head_size; > + int note_head_size, content_size; > const NoteFuncDesc *nf; > > assert(class == ELFCLASS64); > @@ -270,12 +457,15 @@ ssize_t cpu_get_note_size(int class, int machine, int > nr_cpus) > note_head_size = sizeof(Elf64_Nhdr); > > for (nf = note_core; nf->note_contents_func; nf++) { > - elf_note_size = elf_note_size + note_head_size + name_size + > - nf->contents_size; > + elf_note_size = elf_note_size + note_head_size + name_size + > nf->contents_size; > } > for (nf = note_linux; nf->note_contents_func; nf++) { > + if (nf->pvonly && !s390_is_pv()) { > + continue; > + } > + content_size = nf->contents_size ? nf->contents_size : > nf->note_size_func(); > elf_note_size = elf_note_size + note_head_size + name_size + > - nf->contents_size; > + content_size; > } > > return (elf_note_size) * nr_cpus;