Hi,

I have the beginnings of an implementation of the NBD protocol 
(https://github.com/NetworkBlockDevice/nbd) for iPXE. My efforts so far are 
attached, although note that I have stripped out a bunch of extensions from the 
attached code, so it probably won't actually compile as-is. (The stripped code 
is not part of the standard NBD protocol, and includes some deep changes like 
allowing writes to a physical disk.)

Writes are not implemented yet, as my project doesn't require them (writes 
will eventually be redirected to local storage, but the attached code just 
completes the writes as though they succeeded).
My eventual goal is to support the following scenarios:

1.       Diskless Windows boot with writes stored to a local cache

2.       Deploy images of Windows machines live (boot and image simultaneously 
rather than waiting for the imaging process to complete before the machine can 
be used)

3.       Restore Windows machines from backup live (boot as soon as the restore 
process has started)

I originally implemented this in my own custom protocol but nbd supports the 
"sparse" IO calls I need, and a standard nbd implementation might be more 
useful to others, maybe?

So firstly, is the attached code on the right track? It's fairly minimal, and 
needs a lot more error checking, but is it architecturally sound? Am I doing 
the right things with interfaces?

Secondly, is anyone actually interested in an nbd implementation for iPXE? It 
would boot into DOS or another environment that uses int13h, but obviously 
would require drivers for any other OS. Linux can already net boot into an nbd 
root volume without needing nbd support from iPXE, and many of the other 
interesting use cases for nbd are already possible via iSCSI.

Thanks

James

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/**
 * @file
 *
 * NBD Protocol (NBD)
 *
 */

#include <errno.h>
#include <ipxe/uri.h>
#include <ipxe/open.h>
#include <ipxe/xfer.h>
#include <ipxe/features.h>
#include <ipxe/blockdev.h>
#include <ipxe/acpi.h>
#include <ipxe/socket.h>
#include <ipxe/iobuf.h>
#include <ipxe/if_ether.h>
#include <ipxe/tcpip.h>
#include <ipxe/settings.h>
#include <ipxe/vlan.h>
#include <int13.h>
#include <realmode.h>

/*
 * Connection state values stored in nbd_dev.state.
 *
 * Each non-idle state is a small sequence number OR'ed with a direction
 * flag: NBD_STATE_RX_FLAG for states that wait for data from the server and
 * NBD_STATE_TX_FLAG for states that have something to send.  The flag of
 * every state now matches the direction in its name (the RX read state and
 * the TX write state previously carried each other's flag).  All values
 * remain distinct, so the switch statements that dispatch on them are
 * unaffected.
 */
#define NBD_STATE_RX_FLAG               0x100
#define NBD_STATE_TX_FLAG               0x200
#define NBD_STATE_IDLE                  0x0
#define NBD_STATE_RX_NEGOTIATION        (NBD_STATE_RX_FLAG|0x01)
#define NBD_STATE_TX_NEGOTIATION        (NBD_STATE_TX_FLAG|0x02)
#define NBD_STATE_TX_OPTION             (NBD_STATE_TX_FLAG|0x03)
#define NBD_STATE_RX_OPTION             (NBD_STATE_RX_FLAG|0x04)
#define NBD_STATE_RX_OPTION_EXPORT_NAME (NBD_STATE_RX_FLAG|0x05)
#define NBD_STATE_TX_TRANSMISSION_READ  (NBD_STATE_TX_FLAG|0x06)
#define NBD_STATE_RX_TRANSMISSION_READ  (NBD_STATE_RX_FLAG|0x07)
#define NBD_STATE_TX_TRANSMISSION_WRITE (NBD_STATE_TX_FLAG|0x08)
#define NBD_STATE_RX_TRANSMISSION_WRITE (NBD_STATE_RX_FLAG|0x09)

/* Client flags; both are sent together in the NBD_STATE_TX_NEGOTIATION message */
#define NBD_FLAG_C_FIXED_NEWSTYLE       0x01
#define NBD_FLAG_C_NO_ZEROES            0x02

/* Option code placed in the option request sent in NBD_STATE_TX_OPTION */
#define NBD_OPT_EXPORT_NAME             0x01

/*
 * Command type codes for transmission-phase requests.  Only NBD_CMD_READ is
 * actually issued by the code in this file; the others are defined but unused
 * here.
 */
#define NBD_CMD_READ                    0x00
#define NBD_CMD_WRITE                   0x01
#define NBD_CMD_DISC                    0x02
#define NBD_CMD_FLUSH                   0x03
#define NBD_CMD_TRIM                    0x04
#define NBD_CMD_WRITE_ZEROES            0x06

/**
 * Read a 16-bit network-order value from a byte buffer
 *
 * @v buf		Buffer containing the encoded value
 * @v offset		Byte offset of the value within the buffer
 * @ret value		Value converted to host byte order
 *
 * The bytes are copied out with memcpy() instead of dereferencing a cast
 * pointer, so the access is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline uint16_t
get_uint16(char *buf, int offset) {
  uint16_t raw;

  memcpy(&raw, &buf[offset], sizeof(raw));
  return ntohs(raw);
}

/**
 * Read a 32-bit network-order value from a byte buffer
 *
 * @v buf		Buffer containing the encoded value
 * @v offset		Byte offset of the value within the buffer
 * @ret value		Value converted to host byte order
 *
 * The bytes are copied out with memcpy() instead of dereferencing a cast
 * pointer, so the access is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline uint32_t
get_uint32(char *buf, int offset) {
  uint32_t raw;

  memcpy(&raw, &buf[offset], sizeof(raw));
  return ntohl(raw);
}

/**
 * Read a 64-bit network-order value from a byte buffer
 *
 * @v buf		Buffer containing the encoded value
 * @v offset		Byte offset of the value within the buffer
 * @ret value		Value converted to host byte order
 *
 * The bytes are copied out with memcpy() instead of dereferencing a cast
 * pointer, so the access is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline uint64_t
get_uint64(char *buf, int offset) {
  uint64_t raw;

  memcpy(&raw, &buf[offset], sizeof(raw));
  return ntohll(raw);
}

/**
 * Store a 16-bit value into a byte buffer in network byte order
 *
 * @v buf		Destination buffer
 * @v offset		Byte offset at which to store the value
 * @v value		Value in host byte order
 *
 * The converted value is written with memcpy() instead of through a cast
 * pointer, so the store is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline void
put_uint16(char *buf, int offset, uint16_t value) {
  uint16_t raw = htons(value);

  memcpy(&buf[offset], &raw, sizeof(raw));
}

/**
 * Store a 32-bit value into a byte buffer in network byte order
 *
 * @v buf		Destination buffer
 * @v offset		Byte offset at which to store the value
 * @v value		Value in host byte order
 *
 * The converted value is written with memcpy() instead of through a cast
 * pointer, so the store is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline void
put_uint32(char *buf, int offset, uint32_t value) {
  uint32_t raw = htonl(value);

  memcpy(&buf[offset], &raw, sizeof(raw));
}

/**
 * Store a 64-bit value into a byte buffer in network byte order
 *
 * @v buf		Destination buffer
 * @v offset		Byte offset at which to store the value
 * @v value		Value in host byte order
 *
 * The converted value is written with memcpy() instead of through a cast
 * pointer, so the store is valid for any offset (no alignment requirement)
 * and does not violate the strict-aliasing rules.
 */
static __inline void
put_uint64(char *buf, int offset, uint64_t value) {
  uint64_t raw = htonll(value);

  memcpy(&buf[offset], &raw, sizeof(raw));
}

/* Register "NBD" as a feature in the FEATURE_PROTOCOL category */
FEATURE (FEATURE_PROTOCOL, "NBD", DHCP_EB_FEATURE_NBD, 1);

/* TCP port used when the URI does not specify one (see nbd_open_uri) */
#define NBD_PORT 10809

/** NBD boot firmware table signature */
#define NBDFT_SIG ACPI_SIGNATURE ( 'n', 'B', 'F', 'T' )

/*
 * Windows only seems to make a very small number of writes.
 * NOTE(review): WRITE_CACHE_MAX is not referenced by the code visible in this
 * file; presumably it sizes the write cache in the stripped-out extensions.
 */
#define WRITE_CACHE_MAX 16

/*
 * One write cache entry: a logical block address and 512 bytes of data.
 * NOTE(review): nothing in this file reads or writes entries of this type;
 * only a pointer to where they would live is recorded in nbd_dev_describe().
 */
struct write_cache {
  /* Logical block address the cached data belongs to */
  uint64_t lba;
  /* Cached data; 512 matches the sector size the device reports */
  char data[512];
};

/**
 * NBD Boot Firmware Table (nBFT)
 *
 * Packed table filled in by nbd_dev_describe().  All IP addresses are held
 * in struct in6_addr fields; IPv4 values are converted by nbd_set_ip4().
 * Fields not mentioned as "set by" below are not written by the code visible
 * in this file.
 */
struct nbft_table {
  /** ACPI header */
  union {
    struct acpi_description_header acpi;
    /* Reserves 48 bytes for the header irrespective of the ACPI struct size */
    char nbft_header_bytes[48];
  };
  /* Link-layer address of the boot network device (set by nbd_dev_describe) */
  uint8_t mac[ETH_ALEN];
  /* VLAN tag of the boot network device (set by nbd_dev_describe) */
  uint16_t vlan;
  /* Address taken from the "ip" setting (set by nbd_dev_describe) */
  struct in6_addr ip_address;
  /* Prefix length; nbd_dev_describe currently hard-codes 24 */
  int8_t ip_prefix_length;
  /* Address taken from the "gateway" setting (set by nbd_dev_describe) */
  struct in6_addr ip_gateway;
  /* Up to two addresses taken from the "dns" setting (set by nbd_dev_describe) */
  struct in6_addr ip_dns_server[2];
  struct in6_addr ip_dhcp_server;
  uint16_t pci_bus_dev_func;
  /* NBD server port, copied from nbd_dev.port (set by nbd_dev_describe) */
  uint16_t port;
  /* Length in bytes of the server name stored in strings[] */
  uint16_t server_length;
  uint16_t path_length;
  uint16_t write_cache_count;
  /*
   * Start of the variable-length area: the server name (not NUL-terminated)
   * is copied here by nbd_dev_describe().
   */
  char strings[1];
} __attribute__ (( packed ));

/*
 * A single block-device command (read, write or read-capacity) issued against
 * an NBD device.  Allocated by the nbd_dev_block_* handlers.
 */
struct nbd_cmd {
  /* Reference counter; must stay first when initialised with a NULL free hook */
  struct refcnt refcnt;
  /* Interface plugged to the caller that issued the command */
  struct interface block;
  /*
   * Pass-through partner named by nbd_cmd_block_desc.
   * NOTE(review): never initialised or plugged by the code in this file.
   */
  struct interface nbd;
  /* Starting logical block address of the request */
  uint64_t lba;
  /* Number of sectors requested */
  uint32_t count;
  /* Caller's data buffer */
  userptr_t buffer;
  /* Length value passed by the caller alongside the buffer */
  size_t len;
};

/*
 * State for one NBD connection and the block device it exposes.  Created by
 * nbd_open_uri() and released through nbddev_free().
 */
struct nbd_dev {
  /* Reference counter; nbddev_free() is its free hook */
  struct refcnt refcnt;
  /* Network device recorded by nbd_dev_describe() */
  struct net_device *netdev;
  /* Block device interface, plugged to the parent in nbd_open_uri() */
  struct interface block;
  /* Pass-through partner of the block interface (see nbd_dev_block_desc) */
  struct interface nbd;
  /* Stream socket to the NBD server, opened by nbd_connect() */
  struct interface socket;
  /* Current NBD_STATE_* value */
  int state;
  /* Read-capacity command waiting for the export size, or NULL */
  struct nbd_cmd *capacity_cmd;
  /* Read command currently in flight, or NULL */
  struct nbd_cmd *rw_cmd;
  /* Server host name, duplicated from the URI */
  char *server;
  /* Server TCP port in host byte order */
  uint16_t port;
  /* Export name, duplicated from the URI path */
  char *export;
  /* Export size in sectors; zero until the export reply has been received */
  uint64_t total_sectors;
  /* Sector size in bytes; set to 512 when the export reply is parsed */
  uint16_t sector_size;
  /* NOTE(review): not referenced by the code visible in this file */
  char tx_buf[16384];
  /* Reassembly buffer for the message currently being received */
  char rx_buf[16384];
  /* Number of valid bytes accumulated in rx_buf */
  uint16_t rx_count;
  /* Location just past the server name inside the nBFT (see nbd_dev_describe) */
  struct write_cache *write_cache;
  /* Table most recently passed to nbd_dev_describe() */
  struct nbft_table *nbft;
};

static void nbd_dev_tx_resume(struct nbd_dev *nbddev);

static void
nbd_dev_complete_capacity(struct nbd_dev *nbddev) {
  struct block_device_capacity capacity;
  capacity.blksize = nbddev->sector_size;
  capacity.blocks = nbddev->total_sectors;
  capacity.max_count = 31;
  block_capacity(&nbddev->capacity_cmd->block, &capacity);
  dbg_printf("capacity is set\n");
  intf_shutdown(&nbddev->capacity_cmd->block, 0);
  dbg_printf("intf_shutdown is called\n");
  nbddev->capacity_cmd = NULL;
}

/**
 * Accumulate received data into the device's reassembly buffer
 *
 * @v nbddev		NBD device
 * @v iobuf		I/O buffer holding newly received data
 * @v required		Total number of bytes wanted in rx_buf
 * @ret complete	Non-zero once rx_buf holds at least @c required bytes
 *
 * Copies at most the number of bytes still missing from @c iobuf into rx_buf
 * and pulls them from the I/O buffer.  A request larger than rx_buf can never
 * be satisfied and is refused without copying anything; previously it would
 * have written past the end of rx_buf.
 */
static int
nbd_get_rx_data(struct nbd_dev *nbddev, struct io_buffer *iobuf, int required) {
  size_t missing;
  size_t available;
  size_t copy_length;

  if (nbddev->rx_count >= required) {
    return 1;
  }
  if ((size_t)required > sizeof(nbddev->rx_buf)) {
    dbg_printf("rx message of %d bytes exceeds rx buffer\n", required);
    return 0;
  }

  missing = (size_t)required - nbddev->rx_count;
  available = iob_len(iobuf);
  copy_length = (missing < available) ? missing : available;

  memcpy(nbddev->rx_buf + nbddev->rx_count, iobuf->data, copy_length);
  nbddev->rx_count += copy_length;
  iob_pull(iobuf, copy_length);
  return (nbddev->rx_count >= required);
}

static int
nbd_dev_socket_deliver(struct nbd_dev *nbddev, struct io_buffer *iobuf, struct 
xfer_metadata *meta) {
  uint16_t rx_required;
  int rc = 0;

  (void)meta;

  while (1) { 
    switch(nbddev->state) {
    case NBD_STATE_RX_NEGOTIATION:
      rx_required = 18;
      if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
        goto done;
      }
      if (get_uint64(nbddev->rx_buf, 0) != 0x4e42444d41474943) {
        dbg_printf("bad checksum 1\n");
        rc = 1;
        goto done;
      }
      nbddev->state = NBD_STATE_TX_NEGOTIATION;
      nbd_dev_tx_resume(nbddev);
      break;
    case NBD_STATE_RX_OPTION:
      rx_required = 20;
      if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
        goto done;
      }
      rx_required = 20 + get_uint64(nbddev->rx_buf, 16);
      if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
        goto done;
      }
      nbddev->state = NBD_STATE_TX_OPTION;
      break;
    case NBD_STATE_RX_OPTION_EXPORT_NAME:
      rx_required = 10;
      if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
        goto done;
      }
      nbddev->sector_size = 512;
      nbddev->total_sectors = get_uint64(nbddev->rx_buf, 0) / 
nbddev->sector_size;
      dbg_printf("total_sectors = %d\n", (int)nbddev->total_sectors);
      dbg_printf("flags = %04x\n", get_uint16(nbddev->rx_buf, 8));
      if (nbddev->capacity_cmd) {
        nbd_dev_complete_capacity(nbddev);
      }
      nbddev->state = NBD_STATE_IDLE;
      break;
    case NBD_STATE_RX_TRANSMISSION_READ:
      rx_required = 16;
      if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
        goto done;
      }
      if (get_uint32(nbddev->rx_buf, 4) == 0) {
        rx_required = 16 + nbddev->rw_cmd->count * nbddev->sector_size;
        if (!nbd_get_rx_data(nbddev, iobuf, rx_required)) {
          goto done;
        }
      }
      memcpy((void *)nbddev->rw_cmd->buffer, nbddev->rx_buf + 16, 
nbddev->rw_cmd->count * nbddev->sector_size);
      nbddev->state = NBD_STATE_IDLE;
      intf_shutdown(&nbddev->rw_cmd->block, 0);
      free(nbddev->rw_cmd);
      nbddev->rw_cmd = NULL;
      break;
    default:
      goto done;
    }
    if (nbddev->rx_count > rx_required) {
      dbg_printf("excess data ??\n");
    }
    nbddev->rx_count = 0;
  }
done:
  free_iob(iobuf);
  return rc;
}

/**
 * Allocate an I/O buffer for an outgoing message
 *
 * @v nbddev		NBD device
 * @v length		Message length in bytes
 * @v iobuf		Filled with the allocated I/O buffer
 * @ret data		Start of the message area, or NULL if allocation failed
 */
static char *
nbd_dev_tx_alloc(struct nbd_dev *nbddev, size_t length, struct io_buffer **iobuf) {
  *iobuf = xfer_alloc_iob(&nbddev->socket, length);
  if (!*iobuf) {
    dbg_printf("xfer_alloc_iob failed\n");
    return NULL;
  }
  return iob_put(*iobuf, length);
}

/**
 * Send whatever message the current state calls for
 *
 * @v nbddev		NBD device
 *
 * Runs the transmit side of the state machine.  While the socket has a
 * non-zero window and the current state is a transmit state, the matching
 * message is built directly inside a freshly allocated I/O buffer and sent;
 * the state advances only when the send succeeded.  The function returns as
 * soon as there is nothing to send, the window is closed, or an allocation or
 * send fails (the state is then left unchanged so a later call can retry).
 *
 * Messages used to be staged in a malloc()ed buffer that was never freed,
 * allocation failures were dereferenced, and a failed send looped forever.
 */
static void
nbd_dev_tx_resume(struct nbd_dev *nbddev) {
  struct io_buffer *iobuf;
  char *data;
  size_t export_len;
  int next_state;
  int rc;

  while (1) {
    if (!xfer_window(&nbddev->socket)) {
      return;
    }
    switch (nbddev->state) {
    case NBD_STATE_TX_NEGOTIATION:
      dbg_printf("NBD_STATE_TX_NEGOTIATION\n");
      /* Client flags: a single 32-bit word */
      data = nbd_dev_tx_alloc(nbddev, sizeof(uint32_t), &iobuf);
      if (!data) {
        return;
      }
      put_uint32(data, 0, NBD_FLAG_C_FIXED_NEWSTYLE | NBD_FLAG_C_NO_ZEROES);
      next_state = NBD_STATE_TX_OPTION;
      break;
    case NBD_STATE_TX_OPTION:
      dbg_printf("NBD_STATE_TX_OPTION\n");
      /* Option request: magic, option code, data length, export name */
      export_len = nbddev->export ? strlen(nbddev->export) : 0;
      data = nbd_dev_tx_alloc(nbddev, 16 + export_len, &iobuf);
      if (!data) {
        return;
      }
      put_uint64(data, 0, 0x49484156454F5054ULL);
      put_uint32(data, 8, NBD_OPT_EXPORT_NAME);
      put_uint32(data, 12, export_len);
      if (export_len) {
        memcpy(data + 16, nbddev->export, export_len);
      }
      next_state = NBD_STATE_RX_OPTION_EXPORT_NAME;
      break;
    case NBD_STATE_TX_TRANSMISSION_READ:
      if (!nbddev->rw_cmd) {
        /* Nothing to ask for; leave the state for whoever set it */
        return;
      }
      /* Request: magic, flags, type, handle, byte offset, byte length */
      data = nbd_dev_tx_alloc(nbddev, 28, &iobuf);
      if (!data) {
        return;
      }
      put_uint32(data, 0, 0x25609513);
      put_uint16(data, 4, 0);
      put_uint16(data, 6, NBD_CMD_READ);
      put_uint64(data, 8, 0x1234123412341234ULL);
      put_uint64(data, 16, nbddev->rw_cmd->lba * nbddev->sector_size);
      put_uint32(data, 24, nbddev->rw_cmd->count * nbddev->sector_size);
      next_state = NBD_STATE_RX_TRANSMISSION_READ;
      break;
    default:
      return;
    }
    /* The I/O buffer is consumed by xfer_deliver_iob() whatever the outcome */
    rc = xfer_deliver_iob(&nbddev->socket, iobuf);
    if (rc) {
      dbg_printf("xfer_deliver_iob rc = %08x (%s)\n", rc, strerror(rc));
      return;
    }
    nbddev->state = next_state;
  }
}

static void
nbd_dev_socket_close(struct nbd_dev *nbddev, int rc) {
  (void)nbddev;
  dbg_printf("nbd_dev_socket_close rc = %d (%s)\n", rc, strerror(rc));
  intf_shutdown(&nbddev->socket, rc);
}

/*
 * Operations handled on the socket interface of an NBD device: received
 * data, transmit window changes and closure.
 */
static struct interface_operation nbd_dev_socket_ops[] = {
  INTF_OP ( xfer_deliver, struct nbd_dev *, nbd_dev_socket_deliver ),
  INTF_OP ( xfer_window_changed, struct nbd_dev *, nbd_dev_tx_resume ),
  INTF_OP ( intf_close, struct nbd_dev *, nbd_dev_socket_close ),
};

/* Descriptor tying the operations above to the "socket" member of struct nbd_dev */
static struct interface_descriptor nbd_dev_socket_desc =
  INTF_DESC ( struct nbd_dev, socket, nbd_dev_socket_ops );

/*
 * Close handler for a command's block interface: logs the reason and shuts
 * the interface down with the same status.
 *
 * NOTE(review): the owning nbd_dev is not reachable from here, so a pointer
 * to this command held in nbd_dev.rw_cmd or nbd_dev.capacity_cmd is left
 * untouched - confirm that callers never close a command that is in flight.
 */
static void
nbd_cmd_block_close(struct nbd_cmd *nbdcmd, int rc) {
  dbg_printf("nbdcmd_close rc = %d\n", rc);
  intf_shutdown(&nbdcmd->block, rc);
}

/* Operations handled on a command's block interface: closure only */
static struct interface_operation nbd_cmd_block_ops[] = {
  INTF_OP(intf_close, struct nbd_cmd *, nbd_cmd_block_close),
};

/*
 * Descriptor for the "block" member of struct nbd_cmd, declared as a
 * pass-through to the command's "nbd" member.
 */
static struct interface_descriptor nbd_cmd_block_desc = 
INTF_DESC_PASSTHRU(struct nbd_cmd, block, nbd_cmd_block_ops, nbd);

/**
 * Store an IPv4 address as an IPv4-mapped IPv6 address (::ffff:a.b.c.d)
 *
 * @v dst		IPv6 address to fill in
 * @v src		IPv4 address in network byte order
 *
 * An IPv4-mapped address consists of 80 zero bits, 16 one bits and then the
 * four IPv4 address bytes.  The 0xffff marker therefore belongs in bytes
 * 10-11; it used to be written to bytes 12-13, where the IPv4 address copied
 * immediately afterwards overwrote it.
 */
static void nbd_set_ip4(struct in6_addr *dst, struct in_addr *src) {
  memset(dst, 0, sizeof(*dst));
  dst->s6_addr[10] = 0xff;
  dst->s6_addr[11] = 0xff;
  memcpy(&dst->s6_addr[12], src, sizeof(*src));
}

/**
 * Fetch an IPv4 (array) setting and store it as IPv6-format addresses
 *
 * @v settings		Settings block to read from
 * @v addr		Array of @c count IPv6 addresses to fill in
 * @v setting		Setting to fetch
 * @v count		Number of addresses wanted
 *
 * Every element of @c addr is written through nbd_set_ip4().  The temporary
 * array is zeroed first so that entries the setting does not supply (or all
 * entries, if the setting is absent) are converted from 0.0.0.0 rather than
 * from uninitialised stack contents.
 */
static void nbd_set_ip4_setting(struct settings *settings, struct in6_addr 
*addr, const struct setting *setting, int count) {
  int i_count;

  /* A variable-length array must have a positive size */
  if (count <= 0) {
    return;
  }

  struct in_addr in[count];

  memset(in, 0, sizeof(in));
  fetch_ipv4_array_setting(settings, setting, in, count);
  for (i_count = 0; i_count < count; i_count++) {
    nbd_set_ip4(&addr[i_count], &in[i_count]);
  }
}

/* Number of elements in a true array (not valid for pointers) */
#define array_size(a) (sizeof((a)) / sizeof((a)[0]))

static int
nbd_dev_describe(struct nbd_dev *nbddev, struct acpi_description_header *acpi, 
size_t len) {
  struct nbft_table *nbft = container_of ( acpi, struct nbft_table, acpi );
  struct settings *parent;
  struct settings *origin;
  uint32_t offset;
  char *base;

  dbg_printf("nbd_dev_describe\n");

  nbddev->nbft = nbft;

  if (len < offsetof(struct nbft_table, strings) + strlen(nbddev->server)) {
    dbg_printf("bailing - not enough space\n");
    return -ENOBUFS;
  }

  nbddev->netdev = last_opened_netdev();
  parent = netdev_settings(nbddev->netdev);
  fetch_setting(parent, &ip_setting, &origin, NULL, NULL, 0);

  /* Populate table */
  nbft->acpi.signature = cpu_to_le32(NBDFT_SIG);
  nbft->acpi.revision = 1;
  memcpy(nbft->mac, nbddev->netdev->ll_addr, sizeof(nbft->mac));
  dbg_printf("mac = %02x:%02x:%02x:%02x:%02x:%02x\n", nbft->mac[0], 
nbft->mac[1], nbft->mac[2], nbft->mac[3], nbft->mac[4], nbft->mac[5]);
  nbft->vlan = vlan_tag(nbddev->netdev);
  nbd_set_ip4_setting(parent, &nbft->ip_address, &ip_setting, 1);
  dbg_printf("ip_address = %s\n", inet_ntoa(*(struct in_addr 
*)&nbft->ip_address.s6_addr32[3]));
  nbd_set_ip4_setting(parent, &nbft->ip_gateway, &gateway_setting, 1);
  nbd_set_ip4_setting(parent, &nbft->ip_dns_server[0], &dns_setting, 
array_size(nbft->ip_dns_server));
  nbft->ip_prefix_length = 24;
  nbft->port = nbddev->port;
  nbft->server_length = strlen(nbddev->server);
  base = (char *)nbft;
  offset = offsetof(struct nbft_table, strings);
  memcpy(base + offset, nbddev->server, nbft->server_length);
  offset += nbft->server_length;
  nbddev->write_cache = (struct write_cache *)(base + offset);
  nbft->acpi.length = cpu_to_le32(offset);
  return 0;
}

/**
 * Start a block read
 *
 * @v nbddev		NBD device
 * @v block		Interface of the caller issuing the read
 * @v lba		Starting logical block address
 * @v count		Number of sectors to read
 * @v buffer		Destination buffer
 * @v len		Length of the destination buffer
 * @ret rc		0 if the read was started, otherwise a negative error:
 *			-EBUSY if the connection is not idle, -ERANGE if the
 *			request does not fit, -ENOMEM on allocation failure
 *
 * Only one command can be outstanding, and a read can only be started once
 * the connection is idle; starting one earlier would overwrite the handshake
 * state or the command already in flight.  The whole reply (16-byte header
 * plus data) is reassembled in rx_buf, so larger requests are refused here
 * rather than failing later.
 */
static int
nbd_dev_block_read(struct nbd_dev *nbddev, struct interface *block, uint64_t 
lba, unsigned int count, userptr_t buffer, size_t len) {
  struct nbd_cmd *cmd;
  size_t bytes;

  if (nbddev->rw_cmd || nbddev->state != NBD_STATE_IDLE) {
    dbg_printf("nbd_dev_block_read command already in progress??\n");
    return -EBUSY;
  }

  bytes = (size_t)count * nbddev->sector_size;
  if ((bytes > sizeof(nbddev->rx_buf) - 16) || (bytes > len)) {
    dbg_printf("nbd_dev_block_read request too large\n");
    return -ERANGE;
  }

  cmd = zalloc(sizeof(*cmd));
  if (!cmd) {
    dbg_printf("unable to allocate rw_cmd\n");
    return -ENOMEM;
  }
  cmd->lba = lba;
  cmd->count = count;
  cmd->buffer = buffer;
  cmd->len = len;
  ref_init(&cmd->refcnt, NULL);
  intf_init(&cmd->block, &nbd_cmd_block_desc, &cmd->refcnt);

  nbddev->rw_cmd = cmd;
  nbddev->state = NBD_STATE_TX_TRANSMISSION_READ;
  intf_plug_plug(&cmd->block, block);
  nbd_dev_tx_resume(nbddev);
  return 0;
}

/**
 * Handle a block write
 *
 * @v nbddev		NBD device
 * @v block		Interface of the caller issuing the write
 * @v lba		Starting logical block address
 * @v count		Number of sectors to write
 * @v buffer		Source buffer (ignored)
 * @v len		Length of the source buffer (ignored)
 * @ret rc		0 on success, -ENOMEM on allocation failure
 *
 * Writes are not implemented: the data is discarded and the command is
 * completed immediately as though it had succeeded.
 *
 * The temporary command is now checked for allocation failure, has its
 * interface initialised before being plugged, and is released afterwards.
 * It is kept in a local variable so that a read already in flight (tracked
 * in nbddev->rw_cmd) is no longer overwritten.
 */
static int
nbd_dev_block_write(struct nbd_dev *nbddev, struct interface *block, uint64_t 
lba, unsigned int count, userptr_t buffer, size_t len) {
  struct nbd_cmd *cmd;

  (void)buffer;
  (void)len;

  dbg_printf("nbd_dev_block_write %d, %d\n", (int)lba, count);

  if (nbddev->rw_cmd || nbddev->state != NBD_STATE_IDLE) {
    dbg_printf("nbd_dev_block_write command already in progress??\n");
  }

  cmd = zalloc(sizeof(*cmd));
  if (!cmd) {
    return -ENOMEM;
  }
  ref_init(&cmd->refcnt, NULL);
  intf_init(&cmd->block, &nbd_cmd_block_desc, &cmd->refcnt);
  intf_plug_plug(&cmd->block, block);
  intf_shutdown(&cmd->block, 0);
  ref_put(&cmd->refcnt);
  return 0;
}

static int
nbd_dev_block_read_capacity(struct nbd_dev *nbddev, struct interface *block) {
  dbg_printf("nbd_dev_block_read_capacity\n");

  if (nbddev->capacity_cmd) {
    dbg_printf("duplicate capacity cmd???\n");
  }

  nbddev->capacity_cmd = zalloc(sizeof(*nbddev->capacity_cmd));

  ref_init(&nbddev->capacity_cmd->refcnt, NULL);
  intf_init(&nbddev->capacity_cmd->block, &nbd_cmd_block_desc, 
&nbddev->capacity_cmd->refcnt);
  intf_plug_plug(&nbddev->capacity_cmd->block, block);

  if (nbddev->total_sectors != 0) {
    nbd_dev_complete_capacity(nbddev);
  }
  return 0;
}

/**
 * Handle closure of the block device interface
 *
 * @v nbddev		NBD device
 * @v rc		Reason for closure
 *
 * Once its consumer has gone away the device has no further use, so all of
 * its interfaces are shut down, including the socket to the server.
 * Previously only the block interface was shut down, which left the
 * connection open and the device referenced by it.
 */
static void
nbd_dev_block_close(struct nbd_dev *nbddev, int rc) {
  dbg_printf("nbd_dev_block_close rc = %08x (%s)\n", rc, strerror(rc));
  intf_shutdown(&nbddev->block, rc);
  intf_shutdown(&nbddev->nbd, rc);
  intf_shutdown(&nbddev->socket, rc);
}

/*
 * Operations handled on the device's block interface: ACPI table
 * description, block read/write/read-capacity and closure.
 */
static struct interface_operation nbd_dev_block_ops[] = {
  INTF_OP(acpi_describe, struct nbd_dev *, nbd_dev_describe),
  INTF_OP(block_read, struct nbd_dev *, nbd_dev_block_read),
  INTF_OP(block_write, struct nbd_dev *, nbd_dev_block_write),
  INTF_OP(block_read_capacity, struct nbd_dev *, nbd_dev_block_read_capacity),
  INTF_OP(intf_close, struct nbd_dev *, nbd_dev_block_close),
};

/*
 * Close handler for the device's "nbd" interface: logs the reason and shuts
 * that interface down with the same status.  The block and socket interfaces
 * are not touched here.
 */
static void
nbd_dev_nbd_close(struct nbd_dev *nbddev, int rc) {
  dbg_printf("nbd_dev_nbd_close rc = %08x (%s)\n", rc, strerror(rc));
  intf_shutdown(&nbddev->nbd, rc);
}

/* Descriptor for the device's "block" interface, passing through to "nbd" */
static struct interface_descriptor nbd_dev_block_desc = 
INTF_DESC_PASSTHRU(struct nbd_dev, block, nbd_dev_block_ops, nbd);

/* Operations handled on the device's "nbd" interface: closure only */
static struct interface_operation nbd_dev_nbd_ops[] = {
  INTF_OP(intf_close, struct nbd_dev *, nbd_dev_nbd_close),
};

/* Descriptor for the device's "nbd" interface, passing through to "block" */
static struct interface_descriptor nbd_dev_nbd_desc = INTF_DESC_PASSTHRU(struct 
nbd_dev, nbd, nbd_dev_nbd_ops, block);

static void
nbddev_free(struct refcnt *refcnt) {
  dbg_printf("nbddev_free\n");
  struct nbd_dev *nbddev = container_of(refcnt, struct nbd_dev, refcnt);
  free(nbddev);
}

/**
 * Open the stream connection to the NBD server
 *
 * @v nbddev		NBD device (server and port must be set)
 * @ret rc		0 on success, otherwise the error returned by
 *			xfer_open_named_socket()
 *
 * The state is set to wait for the server's greeting before the socket is
 * opened.  The failure message is now logged when the open fails; the test
 * was inverted, so it used to be printed on success only.
 */
static int
nbd_connect(struct nbd_dev *nbddev) {
  struct sockaddr_tcpip sa;
  int rc;

  dbg_printf("nbd_connect\n");
  memset(&sa, 0, sizeof(sa));
  sa.st_port = htons(nbddev->port);
  nbddev->state = NBD_STATE_RX_NEGOTIATION;
  rc = xfer_open_named_socket(&nbddev->socket, SOCK_STREAM, (struct sockaddr 
*)&sa, nbddev->server, NULL);
  if (rc) {
    dbg_printf("xfer_open_named_socket rc = %d (%s)\n", rc, strerror(rc));
  }
  return rc;
}

/**
 * Open an NBD URI (nbd://host[:port]/export)
 *
 * @v parent		Parent interface to attach the block device to
 * @v uri		URI to open
 * @ret rc		0 on success, -EINVAL if the URI has no host, -ENOMEM
 *			on allocation failure, or the error from nbd_connect()
 *
 * The port defaults to NBD_PORT when absent or zero.  The export name is the
 * URI path without its leading '/'; a missing path selects the empty export
 * name.
 *
 * Missing URI components and allocation failures are now checked instead of
 * being dereferenced, and the parent is only attached once the connection
 * has been opened successfully.
 */
int nbd_open_uri(struct interface *parent, struct uri *uri) {
  struct nbd_dev *nbddev;
  const char *export;
  int rc;

  dbg_printf("nbd_open_uri\n");
  if (!uri->host) {
    return -EINVAL;
  }
  dbg_printf("host = %s\n", uri->host);

  nbddev = zalloc(sizeof(*nbddev));
  if (!nbddev) {
    return -ENOMEM;
  }
  ref_init(&nbddev->refcnt, nbddev_free);
  intf_init(&nbddev->block, &nbd_dev_block_desc, &nbddev->refcnt);
  intf_init(&nbddev->nbd, &nbd_dev_nbd_desc, &nbddev->refcnt);
  intf_init(&nbddev->socket, &nbd_dev_socket_desc, &nbddev->refcnt);

  nbddev->port = uri->port ? strtoul(uri->port, NULL, 10) : 0;
  if (!nbddev->port) {
    nbddev->port = NBD_PORT;
  }

  export = uri->path ? uri->path : "";
  if (export[0] == '/') {
    export++;
  }
  nbddev->server = strdup(uri->host);
  nbddev->export = strdup(export);
  if (!nbddev->server || !nbddev->export) {
    rc = -ENOMEM;
    goto err;
  }

  rc = nbd_connect(nbddev);
  if (rc) {
    goto err;
  }

  /* The parent's interface now keeps the device alive */
  intf_plug_plug(&nbddev->block, parent);
  ref_put(&nbddev->refcnt);
  return 0;

err:
  intf_shutdown(&nbddev->socket, rc);
  /* Clear the pointers so a free hook that releases them cannot double free */
  free(nbddev->server);
  nbddev->server = NULL;
  free(nbddev->export);
  nbddev->export = NULL;
  ref_put(&nbddev->refcnt);
  return rc;
}

/* Registers nbd_open_uri() as the opener for URIs with the "nbd" scheme */
struct uri_opener nbd_uri_opener __uri_opener = {
  .scheme  = "nbd",
  .open  = nbd_open_uri,
};

_______________________________________________
ipxe-devel mailing list
ipxe-devel@lists.ipxe.org
https://lists.ipxe.org/mailman/listinfo.cgi/ipxe-devel

Reply via email to