On Wed, Jun 23, 2010 at 04:47:38PM +0300, [email protected] wrote:
> From: Apollon Oikonomopoulos <[email protected]>
>
> This patch introduces basic shared block storage support.
>
> It introduces a new storage backend, bdev.PersistentBlockDevice, to use as a
> backend for shared block storage.
>
> A new disk template, DT_BLOCK is introduced as well and added to
> DTS_EXT_MIRROR
> and DTS_MAY_ADOPT.
>
> This is very basic support and includes no storage manipulation (provisioning,
> resizing, renaming) which will have to be implemented through a "driver"
> framework.
>
> Signed-off-by: Apollon Oikonomopoulos <[email protected]>
> ---
> lib/bdev.py | 137
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> lib/cmdlib.py | 102 +++++++++++++++++++++++++---------------
> lib/constants.py | 14 +++--
> lib/objects.py | 11 +++-
> 4 files changed, 217 insertions(+), 47 deletions(-)
>
> diff --git a/lib/bdev.py b/lib/bdev.py
> index 3d68aea..728b51f 100644
> --- a/lib/bdev.py
> +++ b/lib/bdev.py
> @@ -1937,10 +1937,147 @@ class FileStorage(BlockDev):
>
> return FileStorage(unique_id, children, size)
>
> +class PersistentBlockDevice(BlockDev):
> + """A block device with persistent node
> +
> + May be either directly attached, or exposed through DM (e.g. dm-multipath).
> + udev helpers are probably required to give persistent, human-friendly
> + names.
> +
> + For the time being, pathnames are required to lie under /dev.
> +
> + """
> + def __init__(self, unique_id, children, size):
> + """Attaches to a static block device.
> +
> + The unique_id is a path under /dev.
> +
> + """
> + super(PersistentBlockDevice, self).__init__(unique_id, children, size)
> + if not isinstance(unique_id, (tuple, list)):
> + raise ValueError("Invalid configuration data %s" % str(unique_id))
> + self.dev_path = unique_id[1]
> + if not os.path.realpath(self.dev_path).startswith('/dev/'):
> + raise ValueError("Full path '%s' lies outside /dev" %
> + os.path.realpath(self.dev_path))
> + self.major = self.minor = None
> + self.Attach()
> +
> + @classmethod
> + def Create(cls, unique_id, children, size):
> + """Create a new device
> +
> + This is a noop, we only return a PersistentBlockDevice instance
> +
> + """
> + return PersistentBlockDevice(unique_id, children, size=0)
> +
> + def Remove(self):
> + """Remove a device
> +
> + This is a noop
> +
> + """
> + pass
> +
> + def Rename(self, new_id):
> + """Rename this device.
> +
> + """
> + raise NotImplementedError
> +
> + def Attach(self):
> + """Attach to an existing block device.
> +
> +
> + """
> + self.attached = False
> + try:
> + st = os.stat(self.dev_path)
> + except OSError, err:
> + logging.error("Error stat()'ing %s: %s" % (self.dev_path, str(err)))
> + return False
> +
> + if not stat.S_ISBLK(st[stat.ST_MODE]):
> + logging.error("%s is not a block device", self.dev_path)
> + return False
Hmm… I wonder whether using _ThrowError here wouldn't give better error
reporting (though, granted, we don't use it everywhere — I don't remember
why Attach() uses return False).
> + self.major = os.major(st.st_rdev)
> + self.minor = os.minor(st.st_rdev)
> + self.attached = True
> +
> + return True
> +
> + def Assemble(self):
> + """Assemble the device.
> +
> + """
> + pass
> +
> + def Shutdown(self):
> + """Shutdown the device.
> +
> + """
> + pass
> +
> + def Open(self, force=False):
> + """Make the device ready for I/O.
> +
> + """
> + pass
> +
> + def Close(self):
> + """Notifies that the device will no longer be used for I/O.
> +
> + """
> + pass
The fact that Open/Close are no-ops means that you'll run all LUNs in
read/write mode, all the time. That will make data loss much easier to
trigger, and might require enhancements to Ganeti to prevent starting an
instance on multiple nodes (nowadays we kind of rely on the fact that
we run DRBD in A/P mode, not A/A).
> + def Grow(self, amount):
> + """Grow the logical volume.
> +
> + """
> + raise NotImplementedError
> +
> + def GetSyncStatus(self):
> + """Returns the sync status of the device.
> +
> + If this device is a mirroring device, this function returns the
> + status of the mirror.
> +
> + If the persistent block device name exists, it is assumed to be in
> + sync.
> +
> + The status was already read in Attach, so we just return it.
> +
> + @rtype: objects.BlockDevStatus
> +
> + """
> +
> + if self.attached:
> + ldisk_status = constants.LDS_OKAY
> + else:
> + ldisk_status = constants.LDS_FAULTY
> +
> + return objects.BlockDevStatus(dev_path=self.dev_path,
> + major=self.major,
> + minor=self.minor,
> + sync_percent=None,
> + estimated_time=None,
> + is_degraded=False,
> + ldisk_status=ldisk_status)
> +
> +
> + def DisconnectNet(self):
> + pass
> +
> + def AttachNet(self, multimaster):
> + pass
Hmm, I'm not really sure we need to implement these last three calls
(GetSyncStatus, DisconnectNet, AttachNet) at all.
If I understand the design correctly, Ganeti will be completely
indifferent to the device status, as the SAN itself will take care of
syncing, etc. (at least until the storage plugin framework is
implemented). As such, do we need to query these attributes at all?
> DEV_MAP = {
> constants.LD_LV: LogicalVolume,
> constants.LD_DRBD8: DRBD8,
> + constants.LD_BLOCK: PersistentBlockDevice,
> }
>
> if constants.ENABLE_FILE_STORAGE or constants.ENABLE_SHARED_FILE_STORAGE:
> diff --git a/lib/cmdlib.py b/lib/cmdlib.py
> index fec6c75..9e49e8d 100644
> --- a/lib/cmdlib.py
> +++ b/lib/cmdlib.py
> @@ -5963,6 +5963,18 @@ def _GenerateDiskTemplate(lu, template_name,
> disk_index)),
> mode=disk["mode"])
> disks.append(disk_dev)
> + elif template_name == constants.DT_BLOCK:
> + if len(secondary_nodes) != 0:
> + raise errors.ProgrammerError("Wrong template configuration")
> +
> + for idx, disk in enumerate(disk_info):
> + disk_index = idx + base_index
> + disk_dev = objects.Disk(dev_type=constants.LD_BLOCK, size=disk["size"],
> + logical_id=('dummy', disk["adopt"]),
> + iv_name="disk/%d" % disk_index,
> + mode=disk["mode"])
> + disks.append(disk_dev)
> +
> else:
> raise errors.ProgrammerError("Invalid disk template '%s'" %
> template_name)
> return disks
> @@ -6081,6 +6093,7 @@ def _ComputeDiskSize(disk_template, disks):
> constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
> constants.DT_FILE: None,
> constants.DT_SHARED_FILE: 0,
> + constants.DT_BLOCK: 0,
> }
>
> if disk_template not in req_size_dict:
> @@ -6772,34 +6785,46 @@ class LUCreateInstance(LogicalUnit):
> _CheckNodesFreeDisk(self, nodenames, req_size)
>
> if self.adopt_disks: # instead, we must check the adoption data
> - all_lvs = set([i["adopt"] for i in self.disks])
> - if len(all_lvs) != len(self.disks):
> - raise errors.OpPrereqError("Duplicate volume names given for
> adoption",
> + all_disks = set([i["adopt"] for i in self.disks])
> + if len(all_disks) != len(self.disks):
> + raise errors.OpPrereqError("Duplicate disk names given for adoption",
> errors.ECODE_INVAL)
> - for lv_name in all_lvs:
> - try:
> - self.cfg.ReserveLV(lv_name, self.proc.GetECId())
> - except errors.ReservationError:
> - raise errors.OpPrereqError("LV named %s used by another instance" %
> - lv_name, errors.ECODE_NOTUNIQUE)
> -
> - node_lvs = self.rpc.call_lv_list([pnode.name],
> - self.cfg.GetVGName())[pnode.name]
> - node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
> - node_lvs = node_lvs.payload
> - delta = all_lvs.difference(node_lvs.keys())
> - if delta:
> - raise errors.OpPrereqError("Missing logical volume(s): %s" %
> - utils.CommaJoin(delta),
> - errors.ECODE_INVAL)
> - online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
> - if online_lvs:
> - raise errors.OpPrereqError("Online logical volumes found, cannot"
> - " adopt: %s" %
> utils.CommaJoin(online_lvs),
> - errors.ECODE_STATE)
> - # update the size of disk based on what is found
> - for dsk in self.disks:
> - dsk["size"] = int(float(node_lvs[dsk["adopt"]][0]))
> + if self.op.disk_template == constants.DT_PLAIN:
> + for lv_name in all_disks:
> + try:
> + self.cfg.ReserveLV(lv_name, self.proc.GetECId())
> + except errors.ReservationError:
> + raise errors.OpPrereqError("LV named %s used by another
> instance" %
> + lv_name, errors.ECODE_NOTUNIQUE)
> +
> + node_lvs = self.rpc.call_lv_list([pnode.name],
> + self.cfg.GetVGName())[pnode.name]
> + node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
> + node_lvs = node_lvs.payload
> + delta = all_disks.difference(node_lvs.keys())
> + if delta:
> + raise errors.OpPrereqError("Missing logical volume(s): %s" %
> + utils.CommaJoin(delta),
> + errors.ECODE_INVAL)
> + online_lvs = [lv for lv in all_disks if node_lvs[lv][2]]
> + if online_lvs:
> + raise errors.OpPrereqError("Online logical volumes found, cannot"
> + " adopt: %s" %
> utils.CommaJoin(online_lvs),
> + errors.ECODE_STATE)
> + # update the size of disk based on what is found
> + for dsk in self.disks:
> + dsk["size"] = int(float(node_lvs[dsk["adopt"]][0]))
> + elif self.op.disk_template == constants.DT_BLOCK:
> + node_disks = self.rpc.call_bdev_list([pnode.name])[pnode.name]
> + node_disks = node_disks.payload
> + logging.info("node_disks: %s", node_disks)
> + delta = all_disks.difference(node_disks.keys())
> + if delta:
> + raise errors.OpPrereqError("Missing block device(s): %s" %
> + utils.CommaJoin(delta),
> + errors.ECODE_INVAL)
> + for dsk in self.disks:
> + dsk["size"] = int(float(node_disks[dsk["adopt"]]))
Can you confirm that in this last block you're just moving the DT_PLAIN
code under an 'if', and adding the DT_BLOCK case under an 'elif'?
thanks,
iustin