Hi Jan,
On 2022/9/8 17:09, Jan Beulich wrote:
On 02.09.2022 05:31, Wei Chen wrote:
--- /dev/null
+++ b/xen/common/numa.c
@@ -0,0 +1,442 @@
+/*
+ * Generic VM initialization for NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ry...@us.ibm.com>
+ */
+
+#include <xen/init.h>
+#include <xen/keyhandler.h>
+#include <xen/mm.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/param.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+
+struct node_data __ro_after_init node_data[MAX_NUMNODES];
+
+/* Mapping from pdx to node id */
+unsigned int __ro_after_init memnode_shift;
+unsigned long __ro_after_init memnodemapsize;
+uint8_t *__ro_after_init memnodemap;
+static uint8_t __ro_after_init _memnodemap[64];
These last two want to use nodeid_t instead of uint8_t. Originally
the latter used typeof(*memnodemap) for (I think - iirc it was me who
made it that way) this reason: That way correcting memnodemap's type
would have propagated without the need for further adjustments.
Thanks for this info, should I need to restore it to use
"typeof(*memnodemap)" in next version ?
+nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+
+cpumask_t __read_mostly node_to_cpumask[MAX_NUMNODES];
+
+nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
+
+bool __read_mostly numa_off;
The v3 review discussing this possibly becoming __ro_after_init didn't
really finish (you didn't reply to my latest request there), but you
also didn't change the attribute. Please clarify.
I think I had answered your question by:
>> I think yes, it will be used in numa_disabled and numa_disabled will
>> be called in cpu_add."
And you replied me with:
> In the original code I cannot spot such a path - can you please point
> out how exactly you see numa_disabled() reachable from cpu_add()? I'm
> clearly overlooking something ..."
But there is a time difference here, your reply was sent after I sent
v3, maybe you didn't notice it
About the new question:
cpu_add will call srat_disabled, srat_disabled will access numa_off.
srat_disabled is a function without __init.
+static int __init populate_memnodemap(const struct node *nodes,
+ unsigned int numnodes, unsigned int
shift,
+ nodeid_t *nodeids)
Can't this be pointer-to-const, and then also in the caller?
Yes, it's possible, I will fix it.
+static unsigned int __init extract_lsb_from_nodes(const struct node *nodes,
+ nodeid_t numnodes)
+{
+ unsigned int i;
+ nodeid_t nodes_used = 0;
This once again is a variable which doesn't really hold a node ID (but
instead is a counter), and hence would better be unsigned int (see
./CODING_STYLE).
Ok.
+ unsigned long spdx, epdx;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for ( i = 0; i < numnodes; i++ )
+ {
+ spdx = paddr_to_pdx(nodes[i].start);
+ epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
+ if ( spdx >= epdx )
+ continue;
+ bitfield |= spdx;
+ nodes_used++;
+ if ( epdx > memtop )
+ memtop = epdx;
+ }
+ if ( nodes_used <= 1 )
+ i = BITS_PER_LONG - 1;
+ else
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
Please add the missing blanks around * .
Ok.
+ memnodemapsize = (memtop >> i) + 1;
+ return i;
Please add the missing blank line before the (main) return statement
of the function.
I'll fix him and other places, if there are any.
+int __init compute_hash_shift(const struct node *nodes,
+ nodeid_t numnodes, nodeid_t *nodeids)
While I agree that nodeid_t can hold all necessary values, I still
don't think a cound should be expressed by nodeid_t. As above - see
./CODING_STYLE.
Ok.
+{
+ unsigned int shift;
+
+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
+ memnodemap = _memnodemap;
+ else if ( allocate_cachealigned_memnodemap() )
+ return -1;
+ printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);
This wants to be %u now. I'd also like to ask to drop the full stop
at this occasion.
Ok, that makes sense.
+ if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
+ {
+ printk(KERN_INFO "Your memory is not aligned you need to "
+ "rebuild your hypervisor with a bigger NODEMAPSIZE "
+ "shift=%d\n", shift);
Again %u please.
Ack.
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
Please capitalize the first letter of the comment (see ./CODING_STYLE).
Ok.
+void __init numa_init_array(void)
+{
+ unsigned int i;
+ nodeid_t rr;
+
+ /*
+ * There are unfortunately some poorly designed mainboards around
+ * that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible
+ * CPUs, as the number of CPUs is not known yet.
+ * We round robin the existing nodes.
+ */
Along with the style correction re-flowing of the text would have been
nice, such the lines aren't wrapped seemingly randomly without utilizing
permitted line length.
I will adjust it.
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned int i;
+ paddr_t start = pfn_to_paddr(start_pfn);
+ paddr_t end = pfn_to_paddr(end_pfn);
+
+#ifdef CONFIG_NUMA_EMU
+ if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
+ return;
+#endif
+
+#ifdef CONFIG_NUMA
+ if ( !numa_off && !numa_scan_nodes(start, end) )
+ return;
+#endif
+
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+ printk(KERN_INFO "Faking a node at %"PRIpaddr"-%"PRIpaddr"\n",
+ start, end);
+ /* setup dummy node covering all memory */
Please again capitalize the first character of the comment.
Ok.
+static void cf_check dump_numa(unsigned char key)
+{
+ s_time_t now = NOW();
+ unsigned int i, j, n;
+ struct domain *d;
+ struct page_info *page;
Along with the various other style corrections perhaps add const here?
I will add it.
+ unsigned int page_num_node[MAX_NUMNODES];
+ const struct vnuma_info *vnuma;
+
+ printk("'%c' pressed -> dumping numa info (now = %"PRI_stime")\n", key,
+ now);
+
+ for_each_online_node ( i )
+ {
+ paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
+
+ printk("NODE%u start->%lu size->%lu free->%lu\n",
+ i, node_start_pfn(i), node_spanned_pages(i),
+ avail_node_heap_pages(i));
+ /* sanity check phys_to_nid() */
First char of comment again.
Ok.
Thanks.
Wei Chen
Jan