On Mon, Sep 20, 2021 at 02:49:46PM -0300, Daniel Henrique Barboza wrote:
> The main feature of FORM2 affinity support is the separation of NUMA
> distances from ibm,associativity information. This allows for a more
> flexible and straightforward NUMA distance assignment without relying on
> complex associations between several levels of NUMA via
> ibm,associativity matches. Another feature is its extensibility. This base
> support contains the facilities for NUMA distance assignment, but in the
> future more facilities will be added for latency, performance, bandwidth
> and so on.
> 
> This patch implements the base FORM2 affinity support as follows:
> 
> - the use of FORM2 associativity is indicated by using bit 2 of byte 5
> of ibm,architecture-vec-5. A FORM2 aware guest can choose to use FORM1
> or FORM2 affinity. Setting both forms will default to FORM2. We're not
> advertising FORM2 for pseries-6.1 and older machine versions to prevent
> guest visible changes in those;
> 
> - ibm,associativity-reference-points has a new semantic. Instead of
> being used to calculate distances via NUMA levels, it's now used to
> indicate the primary domain index in the ibm,associativity domain of
> each resource. In our case it's set to {0x4}, matching the position
> where we already place logical_domain_id;
> 
> - two new RTAS DT artifacts are introduced: ibm,numa-lookup-index-table
> and ibm,numa-distance-table. The index table is used to list all the
> NUMA logical domains of the platform, in ascending order, and allows for
> spartial NUMA configurations (although QEMU ATM doesn't support that).
> ibm,numa-distance-table is an array that contains all the distances from
> the first NUMA node to all other nodes, then the second NUMA node
> distances to all other nodes and so on;
> 
> - get_max_dist_ref_points(), get_numa_assoc_size() and get_associativity()
> now checks for OV5_FORM2_AFFINITY and returns FORM2 values if the guest
> selected FORM2 affinity during CAS.
> 
> Reviewed-by: Greg Kurz <gr...@kaod.org>
> Signed-off-by: Daniel Henrique Barboza <danielhb...@gmail.com>
> ---
>  hw/ppc/spapr.c              |   8 ++
>  hw/ppc/spapr_numa.c         | 146 ++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h      |   9 +++
>  include/hw/ppc/spapr_ovec.h |   1 +
>  4 files changed, 164 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index ada85ee083..babb662845 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2752,6 +2752,11 @@ static void spapr_machine_init(MachineState *machine)
>  
>      spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
>  
> +    /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
> +    if (!smc->pre_6_2_numa_affinity) {
> +        spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
> +    }
> +
>      /* advertise support for dedicated HP event source to guests */
>      if (spapr->use_hotplug_event_source) {
>          spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
> @@ -4667,8 +4672,11 @@ DEFINE_SPAPR_MACHINE(6_2, "6.2", true);
>   */
>  static void spapr_machine_6_1_class_options(MachineClass *mc)
>  {
> +    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +
>      spapr_machine_6_2_class_options(mc);
>      compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
> +    smc->pre_6_2_numa_affinity = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(6_1, "6.1", false);
> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
> index 6718c0fdd1..13db321997 100644
> --- a/hw/ppc/spapr_numa.c
> +++ b/hw/ppc/spapr_numa.c
> @@ -24,6 +24,10 @@
>   */
>  static int get_max_dist_ref_points(SpaprMachineState *spapr)
>  {
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
> +        return FORM2_DIST_REF_POINTS;
> +    }
> +
>      return FORM1_DIST_REF_POINTS;
>  }
>  
> @@ -32,6 +36,10 @@ static int get_max_dist_ref_points(SpaprMachineState 
> *spapr)
>   */
>  static int get_numa_assoc_size(SpaprMachineState *spapr)
>  {
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
> +        return FORM2_NUMA_ASSOC_SIZE;
> +    }
> +
>      return FORM1_NUMA_ASSOC_SIZE;
>  }
>  
> @@ -52,6 +60,9 @@ static int get_vcpu_assoc_size(SpaprMachineState *spapr)
>   */
>  static const uint32_t *get_associativity(SpaprMachineState *spapr, int 
> node_id)
>  {
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
> +        return spapr->FORM2_assoc_array[node_id];
> +    }
>      return spapr->FORM1_assoc_array[node_id];
>  }
>  
> @@ -295,14 +306,50 @@ static void 
> spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>      spapr_numa_define_FORM1_domains(spapr);
>  }
>  
> +/*
> + * Init NUMA FORM2 machine state data
> + */
> +static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr)
> +{
> +    int i;
> +
> +    /*
> +     * For all resources but CPUs, FORM2 associativity arrays will
> +     * be a size 2 array with the following format:
> +     *
> +     * ibm,associativity = {1, numa_id}
> +     *
> +     * CPUs will write an additional 'vcpu_id' on top of the arrays
> +     * being initialized here. 'numa_id' is represented by the
> +     * index 'i' of the loop.
> +     *
> +     * Given that this initialization is also valid for GPU associativity
> +     * arrays, handle everything in one single step by populating the
> +     * arrays up to NUMA_NODES_MAX_NUM.
> +     */
> +    for (i = 0; i < NUMA_NODES_MAX_NUM; i++) {
> +        spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1);
> +        spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i);
> +    }
> +}
> +
>  void spapr_numa_associativity_init(SpaprMachineState *spapr,
>                                     MachineState *machine)
>  {
>      spapr_numa_FORM1_affinity_init(spapr, machine);
> +    spapr_numa_FORM2_affinity_init(spapr);
>  }
>  
>  void spapr_numa_associativity_check(SpaprMachineState *spapr)
>  {
> +    /*
> +     * FORM2 does not have any restrictions we need to handle
> +     * at CAS time, for now.
> +     */
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
> +        return;
> +    }
> +
>      spapr_numa_FORM1_affinity_check(MACHINE(spapr));
>  }
>  
> @@ -447,6 +494,100 @@ static void 
> spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
>                       maxdomains, sizeof(maxdomains)));
>  }
>  
> +static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr,
> +                                               void *fdt, int rtas)
> +{
> +    MachineState *ms = MACHINE(spapr);
> +    NodeInfo *numa_info = ms->numa_state->nodes;
> +    int nb_numa_nodes = ms->numa_state->num_nodes;
> +    int distance_table_entries = nb_numa_nodes * nb_numa_nodes;
> +    g_autofree uint32_t *lookup_index_table = NULL;
> +    g_autofree uint32_t *distance_table = NULL;
> +    int src, dst, i, distance_table_size;
> +    uint8_t *node_distances;
> +
> +    /*
> +     * ibm,numa-lookup-index-table: array with length and a
> +     * list of NUMA ids present in the guest.
> +     */
> +    lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1);
> +    lookup_index_table[0] = cpu_to_be32(nb_numa_nodes);
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        lookup_index_table[i + 1] = cpu_to_be32(i);
> +    }
> +
> +    _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table",
> +                     lookup_index_table,
> +                     (nb_numa_nodes + 1) * sizeof(uint32_t)));
> +
> +    /*
> +     * ibm,numa-distance-table: contains all node distances. First
> +     * element is the size of the table as uint32, followed up
> +     * by all the uint8 distances from the first NUMA node, then all
> +     * distances from the second NUMA node and so on.
> +     *
> +     * ibm,numa-lookup-index-table is used by guest to navigate this
> +     * array because NUMA ids can be sparse (node 0 is the first,
> +     * node 8 is the second ...).
> +     */
> +    distance_table = g_new0(uint32_t, distance_table_entries + 1);

You're allocating significantly more than you need here, since the
actual distance entries are u8, not u32.  That can be fixed as a
followup, however.

> +    distance_table[0] = cpu_to_be32(distance_table_entries);
> +
> +    node_distances = (uint8_t *)&distance_table[1];
> +    i = 0;
> +
> +    for (src = 0; src < nb_numa_nodes; src++) {
> +        for (dst = 0; dst < nb_numa_nodes; dst++) {
> +            node_distances[i++] = numa_info[src].distance[dst];
> +        }
> +    }
> +
> +    distance_table_size = distance_table_entries * sizeof(uint8_t) +
> +                          sizeof(uint32_t);
> +    _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table",
> +                     distance_table, distance_table_size));
> +}
> +
> +/*
> + * This helper could be compressed in a single function with
> + * FORM1 logic since we're setting the same DT values, with the
> + * difference being a call to spapr_numa_FORM2_write_rtas_tables()
> + * in the end. The separation was made to avoid clogging FORM1 code
> + * which already has to deal with compat modes from previous
> + * QEMU machine types.
> + */
> +static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr,
> +                                           void *fdt, int rtas)
> +{
> +    MachineState *ms = MACHINE(spapr);
> +    uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
> +                                   spapr_numa_initial_nvgpu_numa_id(ms);
> +
> +    /*
> +     * In FORM2, ibm,associativity-reference-points will point to
> +     * the element in the ibm,associativity array that contains the
> +     * primary domain index (for FORM2, the first element).
> +     *
> +     * This value (in our case, the numa-id) is then used as an index
> +     * to retrieve all other attributes of the node (distance,
> +     * bandwidth, latency) via ibm,numa-lookup-index-table and other
> +     * ibm,numa-*-table properties.
> +     */
> +    uint32_t refpoints[] = { cpu_to_be32(1) };
> +
> +    uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
> +    uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) };
> +
> +    _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
> +                     refpoints, sizeof(refpoints)));
> +
> +    _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
> +                     maxdomains, sizeof(maxdomains)));
> +
> +    spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas);
> +}
> +
>  /*
>   * Helper that writes ibm,associativity-reference-points and
>   * max-associativity-domains in the RTAS pointed by @rtas
> @@ -454,6 +595,11 @@ static void 
> spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
>   */
>  void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
>  {
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
> +        spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas);
> +        return;
> +    }
> +
>      spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas);
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 6b3dfc5dc2..ee7504b976 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -118,6 +118,13 @@ typedef enum {
>  #define FORM1_DIST_REF_POINTS            4
>  #define FORM1_NUMA_ASSOC_SIZE            (FORM1_DIST_REF_POINTS + 1)
>  
> +/*
> + * FORM2 NUMA affinity has a single associativity domain, giving
> + * us a assoc size of 2.
> + */
> +#define FORM2_DIST_REF_POINTS            1
> +#define FORM2_NUMA_ASSOC_SIZE            (FORM2_DIST_REF_POINTS + 1)
> +
>  typedef struct SpaprCapabilities SpaprCapabilities;
>  struct SpaprCapabilities {
>      uint8_t caps[SPAPR_CAP_NUM];
> @@ -145,6 +152,7 @@ struct SpaprMachineClass {
>      hwaddr rma_limit;          /* clamp the RMA to this size */
>      bool pre_5_1_assoc_refpoints;
>      bool pre_5_2_numa_associativity;
> +    bool pre_6_2_numa_affinity;
>  
>      bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio,
> @@ -250,6 +258,7 @@ struct SpaprMachineState {
>      SpaprTpmProxy *tpm_proxy;
>  
>      uint32_t FORM1_assoc_array[NUMA_NODES_MAX_NUM][FORM1_NUMA_ASSOC_SIZE];
> +    uint32_t FORM2_assoc_array[NUMA_NODES_MAX_NUM][FORM2_NUMA_ASSOC_SIZE];
>  
>      Error *fwnmi_migration_blocker;
>  };
> diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
> index 48b716a060..c3e8b98e7e 100644
> --- a/include/hw/ppc/spapr_ovec.h
> +++ b/include/hw/ppc/spapr_ovec.h
> @@ -49,6 +49,7 @@ typedef struct SpaprOptionVector SpaprOptionVector;
>  /* option vector 5 */
>  #define OV5_DRCONF_MEMORY       OV_BIT(2, 2)
>  #define OV5_FORM1_AFFINITY      OV_BIT(5, 0)
> +#define OV5_FORM2_AFFINITY      OV_BIT(5, 2)
>  #define OV5_HP_EVT              OV_BIT(6, 5)
>  #define OV5_HPT_RESIZE          OV_BIT(6, 7)
>  #define OV5_DRMEM_V2            OV_BIT(22, 0)

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to