This is an automated email from the ASF dual-hosted git repository.
jfthomps pushed a commit to branch VCL-1136_KVM_NUMA_and_huge_pages
in repository https://gitbox.apache.org/repos/asf/vcl.git
The following commit(s) were added to
refs/heads/VCL-1136_KVM_NUMA_and_huge_pages by this push:
new 65678bf2 VCL-1136 - NUMA and huge page performance improvement for KVM
65678bf2 is described below
commit 65678bf2f3f50b0c5f3c3bf9feb322a174b2d0a2
Author: Josh Thompson <[email protected]>
AuthorDate: Mon Dec 11 17:13:39 2023 -0500
VCL-1136 - NUMA and huge page performance improvement for KVM
Linux.pm: (cleaned up some whitespace)
-added get_cpu_numa_data
-added get_memory_huge_pages
libvirt.pm: modified generate_domain_xml: added code that evaluates host
and VM core and RAM specifications and if sufficient resources are needed by
the VM and available on the host, configures NUMA and/or huge pages on the VM
to allow it to run more efficiently
---
managementnode/lib/VCL/Module/OS/Linux.pm | 147 ++++++++++++++++++++-
.../lib/VCL/Module/Provisioning/libvirt.pm | 127 +++++++++++++++++-
2 files changed, 262 insertions(+), 12 deletions(-)
diff --git a/managementnode/lib/VCL/Module/OS/Linux.pm
b/managementnode/lib/VCL/Module/OS/Linux.pm
index 8ddc6848..4df1fea4 100644
--- a/managementnode/lib/VCL/Module/OS/Linux.pm
+++ b/managementnode/lib/VCL/Module/OS/Linux.pm
@@ -495,7 +495,7 @@ sub post_load {
my $image_name = $self->data->get_image_name();
my $computer_node_name = $self->data->get_computer_node_name();
my $image_os_install_type = $self->data->get_image_os_install_type();
-
+
notify($ERRORS{'OK'}, 0, "beginning Linux post_load tasks, image:
$image_name, computer: $computer_node_name");
# Wait for computer to respond to SSH
@@ -578,7 +578,7 @@ sub post_load {
}
}
}
-
+
return $self->SUPER::post_load();
}
@@ -1286,11 +1286,11 @@ sub reserve {
# Add a local vcl user group if it doesn't already exist
# Do this before OS.pm::reserve calls add_user_accounts
$self->add_vcl_usergroup();
-
+
# Configure sshd to only listen on the private interface and add
ext_sshd service listening on the public interface
# This needs to be done after update_public_ip_address is called from
OS.pm::reserve
$self->configure_ext_sshd() || return;
-
+
# Call OS.pm's reserve subroutine
$self->SUPER::reserve() || return;
@@ -3101,7 +3101,7 @@ sub delete_user {
if ($home_directory_on_local_disk) {
$delete_home_directory = 1;
-
+
# Fetch exclude_list
my @exclude_list = $self->get_exclude_list();
if ((grep(/\/home\/$username/, @exclude_list))) {
@@ -3151,7 +3151,7 @@ sub delete_user {
}
}
}
-
+
if ($delete_home_directory) {
notify($ERRORS{'DEBUG'}, 0, "home directory will be deleted:
$home_directory_path");
$userdel_command .= ' -r';
@@ -4188,6 +4188,59 @@ sub get_cpu_speed {
#//////////////////////////////////////////////////////////////////////////////
+=head2 get_cpu_numa_data
+
+ Parameters : none
+ Returns : array
+ Description : Retrieves numa information of computer's CPUs
+
+=cut
+
+sub get_cpu_numa_data {
+ my $self = shift;
+ if (ref($self) !~ /VCL::Module/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a
function, it must be called as a class method");
+ return;
+ }
+
+ my $computer_node_name = $self->data->get_computer_node_name();
+
+ my $command = "lscpu";
+ my ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve CPU NUMA info
from $computer_node_name");
+ return;
+ }
+
+ my ($numacnt) = map {$_ =~ /NUMA node\(s\):\s*(\d+)/} @$output;
+ if ($numacnt) {
+ $numacnt = int($numacnt);
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name CPU
NUMA nodes: $numacnt");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name CPU NUMA nodes, 'NUMA node(s):' line does not exist in the
cpuinfo output:\n" . join("\n", @$output));
+ return ();
+ }
+
+ my @numanodes;
+
+ for (my $i = 0; $i < $numacnt; $i++) {
+ ($numanodes[$i]) = map {$_ =~ /NUMA node$i
CPU\(s\):\s*([-,0-9]+)/} @$output;
+ if ($numanodes[$i]) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved
$computer_node_name NUMA node $i CPU(s): $numanodes[$i]");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name NUMA node $i CPU(s), 'NUMA node$i CPU(s):' line does not
exist in the cpuinfo output:\n" . join("\n", @$output));
+ return ();
+ }
+ }
+ notify($ERRORS{'DEBUG'}, 0, "NUMA CPU data for $computer_node_name:\n"
. format_data(@numanodes));
+ return @numanodes;
+}
+
+#//////////////////////////////////////////////////////////////////////////////
+
=head2 get_total_memory
Parameters : none
@@ -4229,6 +4282,88 @@ sub get_total_memory {
#//////////////////////////////////////////////////////////////////////////////
+=head2 get_memory_huge_pages
+
+ Parameters : none
+ Returns : hash reference
+ Description : Retrieves information about the computer's huge pages
+
+=cut
+
+sub get_memory_huge_pages {
+ my $self = shift;
+ if (ref($self) !~ /VCL::Module/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a
function, it must be called as a class method");
+ return;
+ }
+
+ my $computer_node_name = $self->data->get_computer_node_name();
+
+ my $command = "cat /proc/meminfo";
+ my ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve huge page
size from $computer_node_name");
+ return;
+ }
+
+ my ($hugepagesize) = map {$_ =~ /Hugepagesize:\s*(\d+) kB/} @$output;
+ if ($hugepagesize) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge
page size: $hugepagesize kB");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name huge page size from command: '$command', output:\n" .
join("\n", @$output));
+ return;
+ }
+ my ($freemem) = map {$_ =~ /MemFree:\s*(\d+) kB/} @$output;
+ if ($freemem) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name
MemFree: $freemem kB");
+ }
+ my ($hugefree) = map {$_ =~ /HugePages_Free:\s*(\d+)/} @$output;
+ if ($hugefree) {
+ $hugefree = $hugefree * $hugepagesize;
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge
memory free $hugefree kB");
+ }
+
+ $command = "numastat -m";
+ ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve NUMA memory
info from $computer_node_name");
+ return;
+ }
+
+ my @nodes = ();
+ my $nodecnt = 0;
+
+ for my $line (@$output) {
+ if($line =~ /Node/) {
+ my @parts = split(' ', $line);
+ my $len = scalar(@parts);
+ $nodecnt = $parts[$len - 2] + 1;
+ notify($ERRORS{'DEBUG'}, 0, "NUMA node count on
$computer_node_name: $nodecnt");
+ next;
+ }
+ if($line =~ /(MemFree|HugePages_Total|HugePages_Free)/) {
+ my @parts = split(' ', $line);
+ for(my $i = 0; $i < $nodecnt; $i++) {
+ $nodes[$i]{$parts[0]} = $parts[$i + 1] * 1024;
# convert to kB
+ }
+ }
+ }
+ notify($ERRORS{'DEBUG'}, 0, "NUMA data for $computer_node_name\n" .
format_data(@nodes));
+ if($nodecnt == 0 || scalar(@nodes) == 0) {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name NUMA memory data from command: '$command', output:\n" .
join("\n", @$output));
+ return ();
+ }
+ return ( hugepagesize => $hugepagesize, # in kB
+ numapagedata => \@nodes,
+ totalfreemem => $freemem,
+ totalfreehugemem => $hugefree );
+}
+
+#//////////////////////////////////////////////////////////////////////////////
+
=head2 get_exclude_list
Parameters : none
diff --git a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
index 6f2dea8d..58c0bbe3 100644
--- a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
+++ b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
@@ -1844,11 +1844,83 @@ EOF
# Windows, however, expects it to be in so called 'localtime'."
my $clock_offset = ($image_os_type =~ /windows/) ? 'localtime' : 'utc';
- my $cpusockets = $cpu_count;
- my $cpucores = 1;
- if($cpu_count > 2) {
- $cpusockets = 2;
- $cpucores = ($cpu_count - ($cpu_count % 2)) / 2;
+ my $cpusockets;
+ my $cpucores;
+ my $use_numa = 0;
+ my @numacpu;
+ my $optimize_memory = 0;
+ my $use_huge_pages = 0;
+ my $pernode_memory = 0;
+ my $cpuset = '';
+ my %numa_memory = $self->vmhost_os->get_memory_huge_pages();
+
+ if($cpu_count < 32) {
+ $cpusockets = $cpu_count;
+ $cpucores = 1;
+ if($cpu_count > 2) {
+ $cpusockets = 2;
+ $cpucores = ($cpu_count + ($cpu_count % 2)) / 2;
+ }
+ # use regular memory if available to leave huge pages for NUMA
optimized VMs but
+ # use huge pages if not enough regular memory free
+ if ($memory_kb > $numa_memory{totalfreemem} &&
+ $memory_kb <= $numa_memory{totalfreehugemem}) {
+ $use_huge_pages = 1;
+ notify($ERRORS{'DEBUG'}, 0, "Using huge pages for
memory backing");
+ }
+ }
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "CPU count >= 32, using NUMA
settings");
+ $use_numa = 1;
+ @numacpu = $self->vmhost_os->get_cpu_numa_data();
+ my $numacnt = scalar(@numacpu);
+ $cpusockets = $numacnt;
+ if($cpu_count % $numacnt != 0) {
+ # increase cpu_count to be evenly divisible by numa
node count
+ $cpu_count = $cpu_count + ($numacnt - $cpu_count %
$numacnt);
+ notify($ERRORS{'DEBUG'}, 0, "CPU count does not divide
equally among NUMA nodes ($numacnt), increased CPU count to $cpu_count");
+ }
+ $cpucores = $cpu_count / $numacnt;
+
+ my $huge_memory_kb = $memory_kb;
+ if ($memory_kb % ($numa_memory{hugepagesize} * $numacnt)) {
+ # $memory_kb does not evenly divide into huge pages
across all NUMA nodes, need to adjust $memory_kb
+ $huge_memory_kb = $memory_kb +
($numa_memory{hugepagesize} * $numacnt) - ($memory_kb %
($numa_memory{hugepagesize} * $numacnt));
+ notify($ERRORS{'DEBUG'}, 0, "required memory does not
divide evenly among huge page size and NUMA nodes, increasing to
$huge_memory_kb kB if huge pages are used");
+ notify($ERRORS{'DEBUG'}, 0, "----> huge page size:
$numa_memory{hugepagesize}");
+ notify($ERRORS{'DEBUG'}, 0, "----> NUMA nodes:
$numacnt");
+ notify($ERRORS{'DEBUG'}, 0, "----> memory per node: " .
$huge_memory_kb / $numacnt);
+ }
+ $optimize_memory = 1;
+ $use_huge_pages = 1;
+ $pernode_memory = $memory_kb / $numacnt;
+ my $huge_pernode_memory = $huge_memory_kb / $numacnt;
+
+ # memory needed by VM: $memory_kb
+ # divide that by number of NUMA nodes: $numacnt
+ # check that each NUMA node has that much memory free
+ my @tmp_cpuset;
+ for (my $i = 0; $i < $numacnt; $i++) {
+ if($numa_memory{numapagedata}[$i]{MemFree} <
$pernode_memory) {
+ notify($ERRORS{'DEBUG'}, 0, "not enough memory
free to evenly split among NUMA nodes; memory will not be NUMA optimized") if
($optimize_memory);
+ $optimize_memory = 0;
+ }
+ if($numa_memory{numapagedata}[$i]{HugePages_Free} <
$huge_pernode_memory) {
+ notify($ERRORS{'DEBUG'}, 0, "not enough huge
pages free to evenly split among NUMA nodes; memory will not use huge pages")
if ($use_huge_pages);
+ $use_huge_pages = 0;
+ }
+ push @tmp_cpuset, $numacpu[$i];
+ $cpuset = "$cpuset," . $numacpu[$i];
+ }
+ if ($use_huge_pages) {
+ $optimize_memory = 1;
+ }
+ $cpuset = join(",", @tmp_cpuset);
+ if ($use_huge_pages) {
+ notify($ERRORS{'DEBUG'}, 0, "Using huge pages for
memory backing");
+ $memory_kb = $huge_memory_kb;
+ $pernode_memory = $huge_pernode_memory;
+ }
}
my $xml_hashref = {
@@ -1962,7 +2034,50 @@ EOF
notify($ERRORS{'DEBUG'}, 0, "vmpath ($vmhost_vmpath) is on NFS;
setting disk cache to none");
$xml_hashref->{'devices'}[0]{'disk'}[0]{'driver'}{'cache'} =
'none';
}
-
+
+ if ($use_numa) {
+ my $host_numacells = scalar(@numacpu);
+ # vcpu section
+ $xml_hashref->{'vcpu'} = [ {'placement' => 'static', 'cpuset'
=> $cpuset, 'content' => $cpu_count}];
+
+ # cputune section
+ my @pins = ();
+ for (my $i = 0, my $index, my $cpusetval; $i < $cpu_count;
$i++) {
+ $index = int($i / $cpucores);
+ $cpusetval = $numacpu[$index];
+ $pins[$i] = {'vcpu' => $i, 'cpuset' => $cpusetval};
+ }
+ $xml_hashref->{'cputune'} = { 'vcpupin' => \@pins };
+
+ if ($optimize_memory) {
+ # numatune section
+ my @memnodes = ();
+ for (my $i = 0; $i < $host_numacells; $i++) {
+ $memnodes[$i] = { 'cellid' => $i, 'mode' =>
'strict', 'nodeset' => $i };
+ }
+ my $nodeset = "0-" . ($host_numacells - 1);
+ $xml_hashref->{'numatune'} = { 'memory' => {'node' =>
'strict', 'nodeset' => $nodeset}, 'memnode' => \@memnodes};
+
+ # cpu numa section
+ my @numacells = ();
+ for (my $i = 0,
+ my $cores_per_cell = $cpu_count /
$host_numacells,
+ my $lower,
+ my $upper;
+ $i < $host_numacells;
+ $i++) {
+ $lower = $i * $cores_per_cell;
+ $upper = $lower + $cores_per_cell - 1;
+ $numacells[$i] = { 'id' => $i, 'cpus' =>
"$lower-$upper", 'memory' => $pernode_memory, 'unit' => 'KiB' };
+ }
+ $xml_hashref->{'cpu'}[0]{'numa'} = {'cell' =>
\@numacells};
+ }
+ }
+
+ if ($use_huge_pages) {
+ $xml_hashref->{'memoryBacking'} = { 'hugepages' => {} };
+ }
+
notify($ERRORS{'DEBUG'}, 0, "generated domain XML:\n" .
format_data($xml_hashref));
return hash_to_xml_string($xml_hashref, 'domain');
}