This is an automated email from the ASF dual-hosted git repository.
jfthomps pushed a commit to branch VCL-1136_KVM_NUMA_and_huge_pages
in repository https://gitbox.apache.org/repos/asf/vcl.git
The following commit(s) were added to
refs/heads/VCL-1136_KVM_NUMA_and_huge_pages by this push:
new 65678bf2 VCL-1136 - NUMA and huge page performance improvement for KVM
65678bf2 is described below
commit 65678bf2f3f50b0c5f3c3bf9feb322a174b2d0a2
Author: Josh Thompson <[email protected]>
AuthorDate: Mon Dec 11 17:13:39 2023 -0500
VCL-1136 - NUMA and huge page performance improvement for KVM
Linux.pm: (cleaned up some whitespace)
-added get_cpu_numa_data
-added get_memory_huge_pages
libvirt.pm: modified generate_domain_xml: added code that evaluates host
and VM core and RAM specifications and if sufficient resources are needed by
the VM and available on the host, configures NUMA and/or huge pages on the VM
to allow it to run more efficiently
---
managementnode/lib/VCL/Module/OS/Linux.pm | 147 ++++++++++++++++++++-
.../lib/VCL/Module/Provisioning/libvirt.pm | 127 +++++++++++++++++-
2 files changed, 262 insertions(+), 12 deletions(-)
diff --git a/managementnode/lib/VCL/Module/OS/Linux.pm
b/managementnode/lib/VCL/Module/OS/Linux.pm
index 8ddc6848..4df1fea4 100644
--- a/managementnode/lib/VCL/Module/OS/Linux.pm
+++ b/managementnode/lib/VCL/Module/OS/Linux.pm
@@ -495,7 +495,7 @@ sub post_load {
my $image_name = $self->data->get_image_name();
my $computer_node_name = $self->data->get_computer_node_name();
my $image_os_install_type = $self->data->get_image_os_install_type();
-
+
notify($ERRORS{'OK'}, 0, "beginning Linux post_load tasks, image:
$image_name, computer: $computer_node_name");
# Wait for computer to respond to SSH
@@ -578,7 +578,7 @@ sub post_load {
}
}
}
-
+
return $self->SUPER::post_load();
}
@@ -1286,11 +1286,11 @@ sub reserve {
# Add a local vcl user group if it doesn't already exist
# Do this before OS.pm::reserve calls add_user_accounts
$self->add_vcl_usergroup();
-
+
# Configure sshd to only listen on the private interface and add
ext_sshd service listening on the public interface
# This needs to be done after update_public_ip_address is called from
OS.pm::reserve
$self->configure_ext_sshd() || return;
-
+
# Call OS.pm's reserve subroutine
$self->SUPER::reserve() || return;
@@ -3101,7 +3101,7 @@ sub delete_user {
if ($home_directory_on_local_disk) {
$delete_home_directory = 1;
-
+
# Fetch exclude_list
my @exclude_list = $self->get_exclude_list();
if ((grep(/\/home\/$username/, @exclude_list))) {
@@ -3151,7 +3151,7 @@ sub delete_user {
}
}
}
-
+
if ($delete_home_directory) {
notify($ERRORS{'DEBUG'}, 0, "home directory will be deleted:
$home_directory_path");
$userdel_command .= ' -r';
@@ -4188,6 +4188,59 @@ sub get_cpu_speed {
#//////////////////////////////////////////////////////////////////////////////
+=head2 get_cpu_numa_data
+
+ Parameters : none
+ Returns : array
+ Description : Retrieves numa information of computer's CPUs
+
+=cut
+
+sub get_cpu_numa_data {
+ my $self = shift;
+ if (ref($self) !~ /VCL::Module/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a
function, it must be called as a class method");
+ return;
+ }
+
+ my $computer_node_name = $self->data->get_computer_node_name();
+
+ my $command = "lscpu";
+ my ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve CPU NUMA info
from $computer_node_name");
+ return;
+ }
+
+ my ($numacnt) = map {$_ =~ /NUMA node\(s\):\s*(\d+)/} @$output;
+ if ($numacnt) {
+ $numacnt = int($numacnt);
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name CPU
NUMA nodes: $numacnt");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name CPU NUMA nodes, 'NUMA node(s):' line does not exist in the
cpuinfo output:\n" . join("\n", @$output));
+ return ();
+ }
+
+ my @numanodes;
+
+ for (my $i = 0; $i < $numacnt; $i++) {
+ ($numanodes[$i]) = map {$_ =~ /NUMA node$i
CPU\(s\):\s*([-,0-9]+)/} @$output;
+ if ($numanodes[$i]) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved
$computer_node_name NUMA node $i CPU(s): $numanodes[$i]");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name NUMA node $i CPU(s), 'NUMA node$i CPU(s):' line does not
exist in the cpuinfo output:\n" . join("\n", @$output));
+ return ();
+ }
+ }
+ notify($ERRORS{'DEBUG'}, 0, "NUMA CPU data for $computer_node_name:\n"
. format_data(@numanodes));
+ return @numanodes;
+}
+
+#//////////////////////////////////////////////////////////////////////////////
+
=head2 get_total_memory
Parameters : none
@@ -4229,6 +4282,88 @@ sub get_total_memory {
#//////////////////////////////////////////////////////////////////////////////
+=head2 get_memory_huge_pages
+
+ Parameters : none
+ Returns : hash reference
+ Description : Retrieves information about the computer's huge pages
+
+=cut
+
+sub get_memory_huge_pages {
+ my $self = shift;
+ if (ref($self) !~ /VCL::Module/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a
function, it must be called as a class method");
+ return;
+ }
+
+ my $computer_node_name = $self->data->get_computer_node_name();
+
+ my $command = "cat /proc/meminfo";
+ my ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve huge page
size from $computer_node_name");
+ return;
+ }
+
+ my ($hugepagesize) = map {$_ =~ /Hugepagesize:\s*(\d+) kB/} @$output;
+ if ($hugepagesize) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge
page size: $hugepagesize kB");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name huge page size from command: '$command', output:\n" .
join("\n", @$output));
+ return;
+ }
+ my ($freemem) = map {$_ =~ /MemFree:\s*(\d+) kB/} @$output;
+ if ($freemem) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name
MemFree: $freemem kB");
+ }
+ my ($hugefree) = map {$_ =~ /HugePages_Free:\s*(\d+)/} @$output;
+ if ($hugefree) {
+ $hugefree = $hugefree * $hugepagesize;
+ notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge
memory free $hugefree kB");
+ }
+
+ $command = "numastat -m";
+ ($exit_status, $output) = $self->execute($command);
+
+ if (!defined($output)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve NUMA memory
info from $computer_node_name");
+ return;
+ }
+
+ my @nodes = ();
+ my $nodecnt = 0;
+
+ for my $line (@$output) {
+ if($line =~ /Node/) {
+ my @parts = split(' ', $line);
+ my $len = scalar(@parts);
+ $nodecnt = $parts[$len - 2] + 1;
+ notify($ERRORS{'DEBUG'}, 0, "NUMA node count on
$computer_node_name: $nodecnt");
+ next;
+ }
+ if($line =~ /(MemFree|HugePages_Total|HugePages_Free)/) {
+ my @parts = split(' ', $line);
+ for(my $i = 0; $i < $nodecnt; $i++) {
+ $nodes[$i]{$parts[0]} = $parts[$i + 1] * 1024;
# convert to kB
+ }
+ }
+ }
+ notify($ERRORS{'DEBUG'}, 0, "NUMA data for $computer_node_name\n" .
format_data(@nodes));
+ if($nodecnt == 0 || scalar(@nodes) == 0) {
+ notify($ERRORS{'WARNING'}, 0, "failed to determine
$computer_node_name NUMA memory data from command: '$command', output:\n" .
join("\n", @$output));
+ return ();
+ }
+ return ( hugepagesize => $hugepagesize, # in kB
+ numapagedata => \@nodes,
+ totalfreemem => $freemem,
+ totalfreehugemem => $hugefree );
+}
+
+#//////////////////////////////////////////////////////////////////////////////
+
=head2 get_exclude_list
Parameters : none
diff --git a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
index 6f2dea8d..58c0bbe3 100644
--- a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
+++ b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm
@@ -1844,11 +1844,83 @@ EOF
# Windows, however, expects it to be in so called 'localtime'."
my $clock_offset = ($image_os_type =~ /windows/) ? 'localtime' : 'utc';
- my $cpusockets = $cpu_count;
- my $cpucores = 1;
- if($cpu_count > 2) {
- $cpusockets = 2;
- $cpucores = ($cpu_count - ($cpu_count % 2)) / 2;
+ my $cpusockets;
+ my $cpucores;
+ my $use_numa = 0;
+ my @numacpu;
+ my $optimize_memory = 0;
+ my $use_huge_pages = 0;
+ my $pernode_memory = 0;
+ my $cpuset = '';
+ my %numa_memory = $self->vmhost_os->get_memory_huge_pages();
+
+ if($cpu_count < 32) {
+ $cpusockets = $cpu_count;
+ $cpucores = 1;
+ if($cpu_count > 2) {
+ $cpusockets = 2;
+ $cpucores = ($cpu_count + ($cpu_count % 2)) / 2;
+ }
+ # use regular memory if available to leave huge pages for NUMA
optimized VMs but
+ # use huge pages if not enough regular memory free
+ if ($memory_kb > $numa_memory{totalfreemem} &&
+ $memory_kb <= $numa_memory{totalfreehugemem}) {
+ $use_huge_pages = 1;
+ notify($ERRORS{'DEBUG'}, 0, "Using huge pages for
memory backing");
+ }
+ }
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "CPU count >= 32, using NUMA
settings");
+ $use_numa = 1;
+ @numacpu = $self->vmhost_os->get_cpu_numa_data();
+ my $numacnt = scalar(@numacpu);
+ $cpusockets = $numacnt;
+ if($cpu_count % $numacnt != 0) {
+ # increase cpu_count to be evenly divisible by numa
node count
+ $cpu_count = $cpu_count + ($numacnt - $cpu_count %
$numacnt);
+ notify($ERRORS{'DEBUG'}, 0, "CPU count does not divide
equally among NUMA nodes ($numacnt), increased CPU count to $cpu_count");
+ }
+ $cpucores = $cpu_count / $numacnt;
+
+ my $huge_memory_kb = $memory_kb;
+ if ($memory_kb % ($numa_memory{hugepagesize} * $numacnt)) {
+ # $memory_kb does not evenly divide into huge pages
across all NUMA nodes, need to adjust $memory_kb
+ $huge_memory_kb = $memory_kb +
($numa_memory{hugepagesize} * $numacnt) - ($memory_kb %
($numa_memory{hugepagesize} * $numacnt));
+ notify($ERRORS{'DEBUG'}, 0, "required memory does not
divide evenly among huge page size and NUMA nodes, increasing to
$huge_memory_kb kB if huge pages are used");
+ notify($ERRORS{'DEBUG'}, 0, "----> huge page size:
$numa_memory{hugepagesize}");
+ notify($ERRORS{'DEBUG'}, 0, "----> NUMA nodes:
$numacnt");
+ notify($ERRORS{'DEBUG'}, 0, "----> memory per node: " .
$huge_memory_kb / $numacnt);
+ }
+ $optimize_memory = 1;
+ $use_huge_pages = 1;
+ $pernode_memory = $memory_kb / $numacnt;
+ my $huge_pernode_memory = $huge_memory_kb / $numacnt;
+
+ # memory needed by VM: $memory_kb
+ # divide that by number of NUMA nodes: $numacnt
+ # check that each NUMA node has that much memory free
+ my @tmp_cpuset;
+ for (my $i = 0; $i < $numacnt; $i++) {
+ if($numa_memory{numapagedata}[$i]{MemFree} <
$pernode_memory) {
+ notify($ERRORS{'DEBUG'}, 0, "not enough memory
free to evenly split among NUMA nodes; memory will not be NUMA optimized") if
($optimize_memory);
+ $optimize_memory = 0;
+ }
+ if($numa_memory{numapagedata}[$i]{HugePages_Free} <
$huge_pernode_memory) {
+ notify($ERRORS{'DEBUG'}, 0, "not enough huge
pages free to evenly split among NUMA nodes; memory will not use huge pages")
if ($use_huge_pages);
+ $use_huge_pages = 0;
+ }
+ push @tmp_cpuset, $numacpu[$i];
+ $cpuset = "$cpuset," . $numacpu[$i];
+ }
+ if ($use_huge_pages) {
+ $optimize_memory = 1;
+ }
+ $cpuset = join(",", @tmp_cpuset);
+ if ($use_huge_pages) {
+ notify($ERRORS{'DEBUG'}, 0, "Using huge pages for
memory backing");
+ $memory_kb = $huge_memory_kb;
+ $pernode_memory = $huge_pernode_memory;
+ }
}
my $xml_hashref = {
@@ -1962,7 +2034,50 @@ EOF
notify($ERRORS{'DEBUG'}, 0, "vmpath ($vmhost_vmpath) is on NFS;
setting disk cache to none");
$xml_hashref->{'devices'}[0]{'disk'}[0]{'driver'}{'cache'} =
'none';
}
-
+
+ if ($use_numa) {
+ my $host_numacells = scalar(@numacpu);
+ # vcpu section
+ $xml_hashref->{'vcpu'} = [ {'placement' => 'static', 'cpuset'
=> $cpuset, 'content' => $cpu_count}];
+
+ # cputune section
+ my @pins = ();
+ for (my $i = 0, my $index, my $cpusetval; $i < $cpu_count;
$i++) {
+ $index = int($i / $cpucores);
+ $cpusetval = $numacpu[$index];
+ $pins[$i] = {'vcpu' => $i, 'cpuset' => $cpusetval};
+ }
+ $xml_hashref->{'cputune'} = { 'vcpupin' => \@pins };
+
+ if ($optimize_memory) {
+ # numatune section
+ my @memnodes = ();
+ for (my $i = 0; $i < $host_numacells; $i++) {
+ $memnodes[$i] = { 'cellid' => $i, 'mode' =>
'strict', 'nodeset' => $i };
+ }
+ my $nodeset = "0-" . ($host_numacells - 1);
+ $xml_hashref->{'numatune'} = { 'memory' => {'node' =>
'strict', 'nodeset' => $nodeset}, 'memnode' => \@memnodes};
+
+ # cpu numa section
+ my @numacells = ();
+ for (my $i = 0,
+ my $cores_per_cell = $cpu_count /
$host_numacells,
+ my $lower,
+ my $upper;
+ $i < $host_numacells;
+ $i++) {
+ $lower = $i * $cores_per_cell;
+ $upper = $lower + $cores_per_cell - 1;
+ $numacells[$i] = { 'id' => $i, 'cpus' =>
"$lower-$upper", 'memory' => $pernode_memory, 'unit' => 'KiB' };
+ }
+ $xml_hashref->{'cpu'}[0]{'numa'} = {'cell' =>
\@numacells};
+ }
+ }
+
+ if ($use_huge_pages) {
+ $xml_hashref->{'memoryBacking'} = { 'hugepages' => {} };
+ }
+
notify($ERRORS{'DEBUG'}, 0, "generated domain XML:\n" .
format_data($xml_hashref));
return hash_to_xml_string($xml_hashref, 'domain');
}