/*
 * Copyright (c) 2020 Nutanix Inc. All rights reserved.
 *
 * Authors: Thanos Makatos <thanos@nutanix.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Nutanix nor the names of its contributors may be
 *       used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <stdlib.h>

#include "common.h"
#include "migration.h"
#include "private.h"
#include "migration_priv.h"

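/*
 * Returns true if the transition from device state "from" to device state
 * "to" is allowed, by consulting the allowed-transitions bitmask kept in the
 * migr_states table.
 */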
bool
MOCK_DEFINE(vfio_migr_state_transition_is_valid)(uint32_t from, uint32_t to)
{
    return migr_states[from].state & (1 << to);
}

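/*
 * Returns the size of the register area at the start of the migration
 * region: sizeof(struct vfio_user_migration_info) rounded up to the host
 * page size. Migration data must live past this area, so this value is the
 * smallest data offset a device can register its migration callbacks with,
 * e.g. (illustrative only):
 *
 *     vfu_setup_device_migration_callbacks(vfu_ctx, &callbacks,
 *                                          vfu_get_migr_register_area_size());
 */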
EXPORT size_t
vfu_get_migr_register_area_size(void)
{
    return ROUND_UP(sizeof(struct vfio_user_migration_info),
                    sysconf(_SC_PAGE_SIZE));
}

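/*
 * Allocates and initializes the per-context migration state. Fails with
 * EINVAL if data_offset overlaps the register area or if any mandatory
 * callback is missing, and with ENOMEM if allocation fails; on error *err
 * is set and NULL is returned.
 */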
/*
 * TODO no need to dynamically allocate memory, we can keep struct migration
 * in vfu_ctx_t.
 */
struct migration *
init_migration(const vfu_migration_callbacks_t *callbacks,
               uint64_t data_offset, int *err)
{
    struct migration *migr;

    if (data_offset < vfu_get_migr_register_area_size()) {
        *err = EINVAL;
        return NULL;
    }

    migr = calloc(1, sizeof(*migr));
    if (migr == NULL) {
        *err = ENOMEM;
        return NULL;
    }

    /*
     * FIXME: incorrect, if the client doesn't give a pgsize value, it means
     * "no migration support"; handle this.
     * FIXME must be available even if migration callbacks aren't used.
     */
    migr->pgsize = sysconf(_SC_PAGESIZE);

    /* FIXME this should be done in vfu_ctx_realize */
    migr->info.device_state = VFIO_DEVICE_STATE_V1_RUNNING;
    migr->data_offset = data_offset;

    migr->callbacks = *callbacks;
    if (migr->callbacks.transition == NULL ||
        migr->callbacks.get_pending_bytes == NULL ||
        migr->callbacks.prepare_data == NULL ||
        migr->callbacks.read_data == NULL ||
        migr->callbacks.write_data == NULL) {
        free(migr);
        *err = EINVAL;
        return NULL;
    }

    return migr;
}

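/*
 * Sets the state of the save/resume iteration state machine
 * (migr->iter.state); note this is distinct from the VFIO device state.
 */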
void
MOCK_DEFINE(migr_state_transition)(struct migration *migr,
                                   enum migr_iter_state state)
{
    assert(migr != NULL);
    /* FIXME validate this state transition */
    migr->iter.state = state;
}

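/*
 * Maps a VFIO v1 device state to the vfu_migr_state_t passed to the
 * device's transition callback. Returns -1 for device states that have no
 * vfu_migr_state_t equivalent.
 */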
vfu_migr_state_t
MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t device_state)
{
    switch (device_state) {
    case VFIO_DEVICE_STATE_V1_STOP:
        return VFU_MIGR_STATE_STOP;
    case VFIO_DEVICE_STATE_V1_RUNNING:
        return VFU_MIGR_STATE_RUNNING;
    case VFIO_DEVICE_STATE_V1_SAVING:
        /*
         * FIXME How should the device operate during the stop-and-copy
         * phase? Should we only allow the migration data to be read from
         * the migration region, i.e. should access to any other region
         * fail? This might be a good question to send to LKML.
         */
        return VFU_MIGR_STATE_STOP_AND_COPY;
    case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
        return VFU_MIGR_STATE_PRE_COPY;
    case VFIO_DEVICE_STATE_V1_RESUMING:
        return VFU_MIGR_STATE_RESUME;
    }
    return -1;
}

/**
 * Returns 0 on success, -1 on error setting errno.
 */
int
MOCK_DEFINE(state_trans_notify)(vfu_ctx_t *vfu_ctx,
                                int (*fn)(vfu_ctx_t *, vfu_migr_state_t),
                                uint32_t vfio_device_state)
{
    /*
     * We've already checked that device_state is valid by calling
     * vfio_migr_state_transition_is_valid.
     */
    return fn(vfu_ctx, migr_state_vfio_to_vfu(vfio_device_state));
}

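/*
 * Carries out an already-validated device state transition: optionally
 * notifies the device through its transition callback, then records the new
 * device state and resets the iteration state machine.
 */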
/**
 * Returns 0 on success, -1 on failure setting errno.
 */
ssize_t
MOCK_DEFINE(migr_trans_to_valid_state)(vfu_ctx_t *vfu_ctx,
                                       struct migration *migr,
                                       uint32_t device_state, bool notify)
{
    if (notify) {
        int ret;
        assert(!vfu_ctx->in_cb);
        vfu_ctx->in_cb = CB_MIGR_STATE;
        ret = state_trans_notify(vfu_ctx, migr->callbacks.transition,
                                 device_state);
        vfu_ctx->in_cb = CB_NONE;

        if (ret != 0) {
            return ret;
        }
    }
    migr->info.device_state = device_state;
    migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_INITIAL);
    return 0;
}

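/*
 * Handles a write to the device_state register: rejects transitions that
 * vfio_migr_state_transition_is_valid disallows, otherwise carries out the
 * transition.
 */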
/**
 * Returns 0 on success, -1 on failure setting errno.
 */
ssize_t
MOCK_DEFINE(handle_device_state)(vfu_ctx_t *vfu_ctx, struct migration *migr,
                                 uint32_t device_state, bool notify)
{

    assert(migr != NULL);

    if (!vfio_migr_state_transition_is_valid(migr->info.device_state,
                                             device_state)) {
        return ERROR_INT(EINVAL);
    }
    return migr_trans_to_valid_state(vfu_ctx, migr, device_state, notify);
}

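/*
 * Handles a read of the pending_bytes register. At the start of an
 * iteration the value is fetched from the device via the get_pending_bytes
 * callback; repeated reads within the same iteration return the cached
 * value.
 */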
/**
 * Returns 0 on success, -1 on error setting errno.
 */
static ssize_t
handle_pending_bytes(vfu_ctx_t *vfu_ctx, struct migration *migr,
                     uint64_t *pending_bytes, bool is_write)
{
    assert(migr != NULL);
    assert(pending_bytes != NULL);

    if (is_write) {
        return ERROR_INT(EINVAL);
    }

    if (migr->iter.state == VFIO_USER_MIGR_ITER_STATE_FINISHED) {
        *pending_bytes = 0;
        return 0;
    }

    switch (migr->iter.state) {
    case VFIO_USER_MIGR_ITER_STATE_INITIAL:
    case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
        /*
         * FIXME what happens if data haven't been consumed in the previous
         * iteration? Check https://www.spinics.net/lists/kvm/msg228608.html.
         */
        *pending_bytes = migr->iter.pending_bytes =
            migr->callbacks.get_pending_bytes(vfu_ctx);

        if (*pending_bytes == 0) {
            migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_FINISHED);
        } else {
            migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_STARTED);
        }
        break;
    case VFIO_USER_MIGR_ITER_STATE_STARTED:
        /*
         * FIXME We might be wrong returning a cached value, check
         * https://www.spinics.net/lists/kvm/msg228608.html
         */
        *pending_bytes = migr->iter.pending_bytes;
        break;
    default:
        return ERROR_INT(EINVAL);
    }
    return 0;
}

/*
 * FIXME reading or writing migration registers with the wrong device state
 * or out of sequence is undefined, but should not result in EINVAL; it
 * should simply be ignored. However, this way it's easier to catch
 * development errors. Make this behavior conditional.
 */

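/*
 * Handles a read of data_offset while the device is saving state. On the
 * first read of an iteration the device is asked to prepare the next chunk
 * of migration data via the prepare_data callback; the offset then stays
 * constant for the rest of the iteration.
 */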
/**
 * Returns 0 on success, -1 on error setting errno.
 */
static ssize_t
handle_data_offset_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
                               bool is_write)
{
    int ret = 0;

    assert(migr != NULL);

    if (is_write) {
        vfu_log(vfu_ctx, LOG_ERR, "data_offset is RO when saving");
        return ERROR_INT(EINVAL);
    }

    switch (migr->iter.state) {
    case VFIO_USER_MIGR_ITER_STATE_STARTED:
        ret = migr->callbacks.prepare_data(vfu_ctx, &migr->iter.offset,
                                           &migr->iter.size);
        if (ret != 0) {
            return ret;
        }
        /*
         * FIXME must first read data_offset and then data_size. The way
         * we've implemented it now, if data_size is read before data_offset
         * we transition to state VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED
         * without calling callbacks.prepare_data, which is wrong. Maybe we
         * need separate states for data_offset and data_size.
         */
        migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED);
        break;
    case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED:
        /*
         * data_offset is invariant during a save iteration.
         */
        break;
    default:
        vfu_log(vfu_ctx, LOG_ERR,
                "reading data_offset out of sequence is undefined");
        return ERROR_INT(EINVAL);
    }

    return 0;
}

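/*
 * Handles an access to the data_offset register. When saving, the offset of
 * the current iteration is returned; when resuming, the device is asked via
 * the prepare_data callback where the incoming data should be written. In
 * both cases the value returned to the client is made relative to the start
 * of the migration region by adding data_offset.
 */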
/**
 * Returns 0 on success, -1 on error setting errno.
 */
static ssize_t
handle_data_offset(vfu_ctx_t *vfu_ctx, struct migration *migr,
                   uint64_t *offset, bool is_write)
{
    int ret;

    assert(migr != NULL);
    assert(offset != NULL);

    switch (migr->info.device_state) {
    case VFIO_DEVICE_STATE_V1_SAVING:
    case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
        ret = handle_data_offset_when_saving(vfu_ctx, migr, is_write);
        if (ret == 0 && !is_write) {
            *offset = migr->iter.offset + migr->data_offset;
        }
        return ret;
    case VFIO_DEVICE_STATE_V1_RESUMING:
        if (is_write) {
            /* TODO writing to read-only registers should be simply ignored */
            vfu_log(vfu_ctx, LOG_ERR, "bad write to migration data_offset");
            return ERROR_INT(EINVAL);
        }
        ret = migr->callbacks.prepare_data(vfu_ctx, offset, NULL);
        if (ret != 0) {
            return ret;
        }
        *offset += migr->data_offset;
        return 0;
    }
    /* TODO improve error message */
    vfu_log(vfu_ctx, LOG_ERR,
            "bad access to migration data_offset in state %s",
            migr_states[migr->info.device_state].name);
    return ERROR_INT(EINVAL);
}

/**
 * Returns 0 on success, -1 on failure setting errno.
 */
static ssize_t
handle_data_size_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr,
                             bool is_write)
{
    assert(migr != NULL);

    if (is_write) {
        /* TODO improve error message */
        vfu_log(vfu_ctx, LOG_ERR, "data_size is RO when saving");
        return ERROR_INT(EINVAL);
    }

    if (migr->iter.state != VFIO_USER_MIGR_ITER_STATE_STARTED &&
        migr->iter.state != VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED) {
        vfu_log(vfu_ctx, LOG_ERR,
                "reading data_size out of sequence is undefined");
        return ERROR_INT(EINVAL);
    }
    return 0;
}

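/*
 * Handles an access to data_size while resuming: a write means the client
 * has finished writing a chunk of migration data, so the device is notified
 * through the data_written callback; reads are a no-op.
 */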
/**
 * Returns 0 on success, -1 on error setting errno.
 */
static ssize_t
handle_data_size_when_resuming(vfu_ctx_t *vfu_ctx, struct migration *migr,
                               uint64_t size, bool is_write)
{
    assert(migr != NULL);

    if (is_write) {
        return migr->callbacks.data_written(vfu_ctx, size);
    }
    return 0;
}

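/*
 * Handles an access to the data_size register, dispatching on the device
 * state: when saving, reads return the size of the chunk prepared for the
 * current iteration; when resuming, writes are forwarded to the device.
 */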
/**
 * Returns 0 on success, -1 on failure setting errno.
 */
static ssize_t
handle_data_size(vfu_ctx_t *vfu_ctx, struct migration *migr,
                 uint64_t *size, bool is_write)
{
    int ret;

    assert(vfu_ctx != NULL);
    assert(size != NULL);

    switch (migr->info.device_state) {
    case VFIO_DEVICE_STATE_V1_SAVING:
    case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING:
        ret = handle_data_size_when_saving(vfu_ctx, migr, is_write);
        if (ret == 0 && !is_write) {
            *size = migr->iter.size;
        }
        return ret;
    case VFIO_DEVICE_STATE_V1_RESUMING:
        return handle_data_size_when_resuming(vfu_ctx, migr, *size, is_write);
    }
    /* TODO improve error message */
    vfu_log(vfu_ctx, LOG_ERR, "bad access to data_size");
    return ERROR_INT(EINVAL);
}

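/*
 * Dispatches an access to the migration registers (the first
 * sizeof(struct vfio_user_migration_info) bytes of the migration region) to
 * the appropriate handler. The access must start exactly at a register
 * offset and cover exactly that register, otherwise it is rejected with
 * EINVAL.
 */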
/**
 * Returns 0 on success, -1 on failure setting errno.
 */
ssize_t
MOCK_DEFINE(migration_region_access_registers)(vfu_ctx_t *vfu_ctx, char *buf,
                                               size_t count, loff_t pos,
                                               bool is_write)
{
    struct migration *migr = vfu_ctx->migration;
    int ret;
    uint32_t *device_state, old_device_state;

    assert(migr != NULL);

    switch (pos) {
    case offsetof(struct vfio_user_migration_info, device_state):
        if (count != sizeof(migr->info.device_state)) {
            vfu_log(vfu_ctx, LOG_ERR,
                    "bad device_state access size %zu", count);
            return ERROR_INT(EINVAL);
        }
        device_state = (uint32_t *)buf;
        if (!is_write) {
            *device_state = migr->info.device_state;
            return 0;
        }
        old_device_state = migr->info.device_state;
        vfu_log(vfu_ctx, LOG_DEBUG,
                "migration: transitioning from state %s to state %s",
                migr_states[old_device_state].name,
                migr_states[*device_state].name);

        ret = handle_device_state(vfu_ctx, migr, *device_state, true);
        if (ret == 0) {
            vfu_log(vfu_ctx, LOG_DEBUG,
                    "migration: transitioned from state %s to state %s",
                    migr_states[old_device_state].name,
                    migr_states[*device_state].name);
        } else {
            vfu_log(vfu_ctx, LOG_ERR,
                    "migration: failed to transition from state %s to state %s",
                    migr_states[old_device_state].name,
                    migr_states[*device_state].name);
        }
        break;
    case offsetof(struct vfio_user_migration_info, pending_bytes):
        if (count != sizeof(migr->info.pending_bytes)) {
            vfu_log(vfu_ctx, LOG_ERR,
                    "bad pending_bytes access size %zu", count);
            return ERROR_INT(EINVAL);
        }
        ret = handle_pending_bytes(vfu_ctx, migr, (uint64_t *)buf, is_write);
        break;
    case offsetof(struct vfio_user_migration_info, data_offset):
        if (count != sizeof(migr->info.data_offset)) {
            vfu_log(vfu_ctx, LOG_ERR,
                    "bad data_offset access size %zu", count);
            return ERROR_INT(EINVAL);
        }
        ret = handle_data_offset(vfu_ctx, migr, (uint64_t *)buf, is_write);
        break;
    case offsetof(struct vfio_user_migration_info, data_size):
        if (count != sizeof(migr->info.data_size)) {
            vfu_log(vfu_ctx, LOG_ERR,
                    "bad data_size access size %zu", count);
            return ERROR_INT(EINVAL);
        }
        ret = handle_data_size(vfu_ctx, migr, (uint64_t *)buf, is_write);
        break;
    default:
        vfu_log(vfu_ctx, LOG_ERR, "bad migration region register offset %#lx",
                pos);
        return ERROR_INT(EINVAL);
    }
    return ret;
}

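/*
 * Entry point for all accesses to the migration region. Accesses that fall
 * entirely within the register area are dispatched to
 * migration_region_access_registers; accesses at or past data_offset are
 * forwarded to the device's read_data/write_data callbacks; anything in the
 * dead space in between is rejected. Returns the number of bytes accessed
 * on success, -1 on failure setting errno.
 */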
ssize_t
migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
                        loff_t pos, bool is_write)
{
    struct migration *migr = vfu_ctx->migration;
    ssize_t ret;

    assert(migr != NULL);
    assert(buf != NULL);

    /*
     * FIXME don't call the device callback if the migration state is not in
     * the pre-copy/stop-and-copy/resuming state, since the behavior is
     * undefined in that case.
     */

    if (pos + count <= sizeof(struct vfio_user_migration_info)) {
        ret = migration_region_access_registers(vfu_ctx, buf, count,
                                                pos, is_write);
        if (ret != 0) {
            return ret;
        }
    } else {

        if (pos < (loff_t)migr->data_offset) {
            /*
             * TODO we can simply ignore the access to that part and handle
             * any access to the data region properly.
             */
            vfu_log(vfu_ctx, LOG_WARNING,
                    "bad access to dead space %#lx-%#lx in migration region",
                    pos, pos + count - 1);
            return ERROR_INT(EINVAL);
        }

        pos -= migr->data_offset;
        if (is_write) {
            ret = migr->callbacks.write_data(vfu_ctx, buf, count, pos);
            if (ret < 0) {
                return -1;
            }
        } else {
            /*
             * FIXME <linux/vfio.h> says:
             *
             * d. Read data_size bytes of data from (region + data_offset)
             *    from the migration region.
             *
             * Does this mean that partial reads are not allowed?
             */
            ret = migr->callbacks.read_data(vfu_ctx, buf, count, pos);
            if (ret < 0) {
                return -1;
            }
        }
    }

    return count;
}

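/*
 * Returns true if the device is in the stop-and-copy phase, i.e. the device
 * state is exactly VFIO_DEVICE_STATE_V1_SAVING with the RUNNING bit clear.
 */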
bool
MOCK_DEFINE(device_is_stopped_and_copying)(struct migration *migr)
{
    return migr != NULL &&
           migr->info.device_state == VFIO_DEVICE_STATE_V1_SAVING;
}

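/* Returns true if the device is stopped (VFIO_DEVICE_STATE_V1_STOP). */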
bool
MOCK_DEFINE(device_is_stopped)(struct migration *migr)
{
    return migr != NULL &&
           migr->info.device_state == VFIO_DEVICE_STATE_V1_STOP;
}

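/* Returns the migration page size currently in effect. */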
size_t
migration_get_pgsize(struct migration *migr)
{
    assert(migr != NULL);

    return migr->pgsize;
}

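/*
 * Sets the migration page size requested by the client. For now only the
 * host page size is accepted.
 */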
int
migration_set_pgsize(struct migration *migr, size_t pgsize)
{
    assert(migr != NULL);

    /* FIXME? */
    if (pgsize != PAGE_SIZE) {
        return ERROR_INT(EINVAL);
    }

    migr->pgsize = pgsize;
    return 0;
}

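/*
 * Returns true if an access to the given region and offset touches the
 * device_state migration register and must therefore be preceded by
 * quiescing the device.
 */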
bool
access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index,
                               uint64_t offset)
{
    /*
     * Writing to the migration state register with an unaligned access won't
     * trigger this check, but that's not a problem because
     * migration_region_access_registers will fail the access.
     */
    return region_index == VFU_PCI_DEV_MIGR_REGION_IDX
           && vfu_ctx->migration != NULL
           && offset == offsetof(struct vfio_user_migration_info, device_state);
}

/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */