Blob Blame History Raw
/*
 * (C) Copyright HUAWEI Technology Corp. 2017   All Rights Reserved.
 *
 * ana.c
 * Version 1.00
 *
 * Tool to make use of a NVMe-feature called  Asymmetric Namespace Access.
 * It determines the ANA state of a device and prints a priority value to stdout.
 *
 * Author(s): Cheng Jike <chengjike.cheng@huawei.com>
 *            Li Jie <lijie34@huawei.com>
 *
 * This file is released under the GPL version 2, or any later version.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdbool.h>
#include <libudev.h>

#include "debug.h"
#include "nvme-lib.h"
#include "prio.h"
#include "util.h"
#include "structs.h"

enum {
	ANA_ERR_GETCTRL_FAILED		= 1,
	ANA_ERR_NOT_NVME,
	ANA_ERR_NOT_SUPPORTED,
	ANA_ERR_GETANAS_OVERFLOW,
	ANA_ERR_GETANAS_NOTFOUND,
	ANA_ERR_GETANALOG_FAILED,
	ANA_ERR_GETNSID_FAILED,
	ANA_ERR_GETNS_FAILED,
	ANA_ERR_NO_MEMORY,
	ANA_ERR_NO_INFORMATION,
};

static const char *ana_errmsg[] = {
	[ANA_ERR_GETCTRL_FAILED]	= "couldn't get ctrl info",
	[ANA_ERR_NOT_NVME]		= "not an NVMe device",
	[ANA_ERR_NOT_SUPPORTED]		= "ANA not supported",
	[ANA_ERR_GETANAS_OVERFLOW]	= "buffer overflow in ANA log",
	[ANA_ERR_GETANAS_NOTFOUND]	= "NSID or ANAGRPID not found",
	[ANA_ERR_GETANALOG_FAILED]	= "couldn't get ana log",
	[ANA_ERR_GETNSID_FAILED]	= "couldn't get NSID",
	[ANA_ERR_GETNS_FAILED]		= "couldn't get namespace info",
	[ANA_ERR_NO_MEMORY]		= "out of memory",
	[ANA_ERR_NO_INFORMATION]	= "invalid fd",
};

static const char *anas_string[] = {
	[NVME_ANA_OPTIMIZED]			= "ANA Optimized State",
	[NVME_ANA_NONOPTIMIZED]			= "ANA Non-Optimized State",
	[NVME_ANA_INACCESSIBLE]			= "ANA Inaccessible State",
	[NVME_ANA_PERSISTENT_LOSS]		= "ANA Persistent Loss State",
	[NVME_ANA_CHANGE]			= "ANA Change state",
};

static const char *aas_print_string(int rc)
{
	rc &= 0xff;
	if (rc >= 0 && rc < (int)ARRAY_SIZE(anas_string) &&
	    anas_string[rc] != NULL)
		return anas_string[rc];

	return "invalid ANA state";
}

static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log,
			 size_t ana_log_len)
{
	void *base = ana_log;
	struct nvme_ana_rsp_hdr *hdr = base;
	struct nvme_ana_group_desc *ana_desc;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	__u32 nr_nsids;
	size_t nsid_buf_size;
	int i;
	unsigned int j;

	for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) {
		ana_desc = base + offset;

		offset += sizeof(*ana_desc);
		if (offset > ana_log_len)
			return -ANA_ERR_GETANAS_OVERFLOW;

		nr_nsids = le32_to_cpu(ana_desc->nnsids);
		nsid_buf_size = nr_nsids * sizeof(__le32);

		offset += nsid_buf_size;
		if (offset > ana_log_len)
			return -ANA_ERR_GETANAS_OVERFLOW;

		for (j = 0; j < nr_nsids; j++) {
			if (nsid == le32_to_cpu(ana_desc->nsids[j]))
				return ana_desc->state;
		}

		if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc->grpid))
			return ana_desc->state;

	}
	return -ANA_ERR_GETANAS_NOTFOUND;
}

static int get_ana_info(struct path * pp)
{
	int	rc;
	__u32 nsid;
	struct nvme_id_ctrl ctrl;
	struct nvme_id_ns ns;
	void *ana_log;
	size_t ana_log_len;
	bool is_anagrpid_const;

	rc = nvme_id_ctrl_ana(pp->fd, &ctrl);
	if (rc < 0) {
		log_nvme_errcode(rc, pp->dev, "nvme_identify_ctrl");
		return -ANA_ERR_GETCTRL_FAILED;
	} else if (rc == 0)
		return -ANA_ERR_NOT_SUPPORTED;

	nsid = nvme_get_nsid(pp->fd);
	if (nsid <= 0) {
		log_nvme_errcode(rc, pp->dev, "nvme_get_nsid");
		return -ANA_ERR_GETNSID_FAILED;
	}
	is_anagrpid_const = ctrl.anacap & (1 << 6);

	/*
	 * Code copied from nvme-cli/nvme.c. We don't need to allocate an
	 * [nanagrpid*mnan] array of NSIDs because each NSID can occur at most
	 * in one ANA group.
	 */
	ana_log_len = sizeof(struct nvme_ana_rsp_hdr) +
		le32_to_cpu(ctrl.nanagrpid)
		* sizeof(struct nvme_ana_group_desc);

	if (is_anagrpid_const) {
		rc = nvme_identify_ns(pp->fd, nsid, 0, &ns);
		if (rc) {
			log_nvme_errcode(rc, pp->dev, "nvme_identify_ns");
			return -ANA_ERR_GETNS_FAILED;
		}
	} else
		ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32);

	ana_log = malloc(ana_log_len);
	if (!ana_log)
		return -ANA_ERR_NO_MEMORY;
	pthread_cleanup_push(free, ana_log);
	rc = nvme_ana_log(pp->fd, ana_log, ana_log_len,
			  is_anagrpid_const ? NVME_ANA_LOG_RGO : 0);
	if (rc) {
		log_nvme_errcode(rc, pp->dev, "nvme_ana_log");
		rc = -ANA_ERR_GETANALOG_FAILED;
	} else
		rc = get_ana_state(nsid,
				   is_anagrpid_const ?
				   le32_to_cpu(ns.anagrpid) : 0,
				   ana_log, ana_log_len);
	pthread_cleanup_pop(1);
	if (rc >= 0)
		condlog(4, "%s: ana state = %02x [%s]", pp->dev, rc,
			aas_print_string(rc));
	return rc;
}

/*
 * Priorities modeled roughly after the ALUA model (alua.c/sysfs.c)
 * Reference: ANA Base Protocol (NVMe TP 4004a, 11/13/2018).
 *
 * Differences:
 *
 * - The ANA base spec defines no implicit or explicit (STPG) state management.
 *   If a state is encountered that doesn't allow normal I/O (all except
 *   OPTIMIZED and NON_OPTIMIZED), we can't do anything but either wait for a
 *   Access State Change Notice (can't do that in multipathd as we don't receive
 *   those), or retry commands in regular time intervals until ANATT is expired
 *   (not implemented). Mapping UNAVAILABLE state to ALUA STANDBY is the best we
 *   can currently do.
 *
 *   FIXME: Waiting for ANATT could be implemented with a "delayed failback"
 *   mechanism. The current "failback" method can't be used, as it would
 *   affect failback to every state, and here only failback to UNAVAILABLE
 *   should be delayed.
 *
 * - PERSISTENT_LOSS state is even below ALUA's UNAVAILABLE state.
 *   FIXME: According to the ANA TP, accessing paths in PERSISTENT_LOSS state
 *   in any way makes no sense (e.g. ยง8.19.6 - paths in this state shouldn't
 *   even be checked under "all paths down" conditions). Device mapper can,
 *   and will, select a PG for IO if it has non-failed paths, even if the
 *   PG has priority 0. We could avoid that only with an "ANA path checker".
 *
 * - ALUA has no CHANGE state. The ANA TP ยง8.18.3 / ยง8.19.4 suggests
 *   that CHANGE state should be treated in roughly the same way as
 *   INACCESSIBLE. Therefore we assign the same prio to it.
 *
 * - ALUA's LBA-dependent state has no ANA equivalent.
 */

int getprio(struct path *pp, __attribute__((unused)) char *args,
	    __attribute__((unused)) unsigned int timeout)
{
	int rc;

	if (pp->fd < 0)
		rc = -ANA_ERR_NO_INFORMATION;
	else
		rc = get_ana_info(pp);

	switch (rc) {
	case NVME_ANA_OPTIMIZED:
		return 50;
	case NVME_ANA_NONOPTIMIZED:
		return 10;
	case NVME_ANA_INACCESSIBLE:
	case NVME_ANA_CHANGE:
		return 1;
	case NVME_ANA_PERSISTENT_LOSS:
		return 0;
	default:
		break;
	}
	if (rc < 0 && -rc < (int)ARRAY_SIZE(ana_errmsg))
		condlog(2, "%s: ANA error: %s", pp->dev, ana_errmsg[-rc]);
	else
		condlog(1, "%s: invalid ANA rc code %d", pp->dev, rc);
	return -1;
}