Public bug reported:

It is possible for placement to get out of sync with Nova, which can cause
scheduling problems that would otherwise go unnoticed.  I've written the
script below, which would be nice to have as `nova-manage placement audit`:

================================================================================
#!/usr/bin/env python

import argparse
import sys

from openstack import connection
import openstack.config

config = openstack.config.OpenStackConfig()
parser = argparse.ArgumentParser()
config.register_argparse_arguments(parser, sys.argv)

options = parser.parse_args()

cloud_region = config.get_one(argparse=options)
conn = connection.Connection(config=cloud_region)

# Grab the list of all hypervisors and their servers.
hypervisors = conn.compute.get(
    '/os-hypervisors?with_servers=true',
    microversion='2.53').json().get('hypervisors')

# Build a mapping of hypervisor id => [instance UUIDs], plus id => hostname.
hypervisor_mapping = {h['id']: [s['uuid'] for s in h.get('servers', [])]
                      for h in hypervisors}
hypervisor_names = {h['id']: h['hypervisor_hostname'] for h in hypervisors}

# Grab the list of all resource providers.
resource_providers = conn.placement.get(
    '/resource_providers').json().get('resource_providers')

for rp in resource_providers:
  # Check whether the RP has compute inventory (i.e. is a compute node).
  inventories = conn.placement.get(
      '/resource_providers/%s/inventories'
      % rp['uuid']).json().get('inventories')

  # Skip providers without VCPU and MEMORY_MB inventory (non-computes).
  if 'MEMORY_MB' not in inventories and 'VCPU' not in inventories:
    continue

  # Get all allocations held against this RP.
  allocations = conn.placement.get(
      '/resource_providers/%s/allocations'
      % rp['uuid']).json().get('allocations')

  # Is there a compute node for this RP?
  if rp['uuid'] not in hypervisor_mapping:
    print("openstack resource provider delete %s  "
          "# resource provider does not have a matching compute node"
          % rp['uuid'])
    continue

  for allocation_id, info in allocations.items():
    # The instance does not exist where placement says it should be.
    if allocation_id not in hypervisor_mapping[rp['uuid']]:
      hypervisor = None

      # Try to find where it's hiding.
      for hyp, instances in hypervisor_mapping.items():
        if allocation_id in instances:
          hypervisor = hyp
          break

      # We found it on another compute node.
      if hypervisor:
        classes = ','.join('%s=%s' % (key, value)
                           for key, value in info.get('resources').items())
        print("openstack resource provider allocation set "
              "--allocation rp=%s,%s %s  # instance allocated on wrong rp"
              % (hypervisor, classes, allocation_id))
        continue

      # We don't know where this is.  Check whether it still exists in Nova.
      server = conn.compute.get('/servers/%s' % allocation_id)
      if server.status_code == 404:
        print("openstack resource provider allocation delete %s  "
              "# instance deleted" % allocation_id)
        continue

      # TODO: other edge cases?
      raise RuntimeError('unhandled allocation %s on resource provider %s'
                         % (allocation_id, rp['uuid']))
================================================================================
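
For each mismatch it finds, the script prints a remediation command that an
operator can review and run by hand.  The UUIDs and resource amounts below are
placeholders; only the command shapes are what the script actually emits:

  openstack resource provider delete <rp_uuid>  # resource provider does not have a matching compute node
  openstack resource provider allocation set --allocation rp=<rp_uuid>,VCPU=2,MEMORY_MB=4096 <instance_uuid>  # instance allocated on wrong rp
  openstack resource provider allocation delete <instance_uuid>  # instance deleted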

It would likely need to be rewritten to use the built-in placement HTTP
client and objects to avoid extra API calls.
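
To make that concrete, here is a minimal sketch of how the audit loop could be
wired into a nova-manage command using Nova's internal placement report client
(nova.scheduler.client.report.SchedulerReportClient) and versioned objects.
This is a sketch under assumptions rather than working code: only the client's
raw get() call is used, since the higher-level helper methods and microversion
handling would need to be checked against the in-tree client.

================================================================================
# Hypothetical sketch only; the command class name and the cross-check
# against Nova's database are placeholders for the real implementation.
from nova import context as nova_context
from nova import objects
from nova.scheduler.client import report


class PlacementCommands(object):

  def audit(self):
    ctxt = nova_context.get_admin_context()
    client = report.SchedulerReportClient()

    # Compute nodes known to Nova, keyed by UUID.
    nodes = {cn.uuid: cn for cn in objects.ComputeNodeList.get_all(ctxt)}

    resource_providers = client.get(
        '/resource_providers').json()['resource_providers']
    for rp in resource_providers:
      # A provider with no matching compute node is a candidate orphan.
      if rp['uuid'] not in nodes:
        print('Orphaned resource provider: %s' % rp['uuid'])
        continue

      allocations = client.get(
          '/resource_providers/%s/allocations'
          % rp['uuid']).json()['allocations']
      for consumer_uuid in allocations:
        # Cross-check each consumer (instance) against the host/node
        # recorded for it in the Nova database.
        pass
    return 0
================================================================================

Hooking that into the existing nova-manage plumbing would then mostly be a
matter of registering the command and turning the prints into the same
remediation suggestions the script above produces.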

** Affects: nova
     Importance: Undecided
         Status: New

-- 
You received this bug notification because you are a member of Yahoo!
Engineering Team, which is subscribed to OpenStack Compute (nova).
https://bugs.launchpad.net/bugs/1793569

Title:
  Add placement audit commands

Status in OpenStack Compute (nova):
  New

