Blob Blame History Raw
/*
 * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
 *
 * path_latency.c
 *
 * Prioritizer for device mapper multipath, where the corresponding priority
 * values of specific paths are provided by a latency algorithm. And the
 * latency algorithm is dependent on arguments("io_num" and "base_num").
 *
 * The principle of the algorithm is as follows:
 * 1. By sending a certain number "io_num" of read IOs to the current path
 *    continuously, the IOs' average latency can be calculated.
 * 2. Max value and min value of average latency are constant. According to
 *    the average latency of each path and the "base_num" of logarithmic
 *    scale, the priority "rc" of each path can be provided.
 *
 * Author(s): Yang Feng <philip.yang@huawei.com>
 * Revised:   Guan Junxiong <guanjunxiong@huawei.com>
 *
 * This file is released under the GPL version 2, or any later version.
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <ctype.h>
#include <limits.h>
#include <time.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

#include "debug.h"
#include "prio.h"
#include "structs.h"
#include "util.h"
#include "time-util.h"

#define pp_pl_log(prio, fmt, args...) condlog(prio, "path_latency prio: " fmt, ##args)

#define MAX_IO_NUM		200
#define MIN_IO_NUM		20
#define DEF_IO_NUM		100

#define MAX_BASE_NUM		10
#define MIN_BASE_NUM		1.1
// This is 10**(1/4). 4 prio steps correspond to a factor of 10.
#define DEF_BASE_NUM		1.77827941004

#define MAX_AVG_LATENCY		100000000.	/* Unit: us */
#define MIN_AVG_LATENCY		1.		/* Unit: us */

#define DEFAULT_PRIORITY	0

#define USEC_PER_SEC		1000000LL
#define NSEC_PER_USEC		1000LL

#define DEF_BLK_SIZE		4096

/*
 * Prepare @fd for direct-I/O reads.
 *
 * @fd:            open descriptor of the path device
 * @blksz:         out: block size reported by BLKBSZGET, or DEF_BLK_SIZE
 *                 if the ioctl fails
 * @pbuf:          out: page-aligned buffer of *blksz bytes; the caller
 *                 releases it with cleanup_directio_read()
 * @restore_flags: set to 1 if O_DIRECT had to be switched on here (so the
 *                 caller knows it must be cleared again); left untouched
 *                 if O_DIRECT was already set
 *
 * Returns 0 on success. Returns -1 on failure, in which case no buffer is
 * left allocated and *pbuf is NULL.
 */
static int prepare_directio_read(int fd, int *blksz, char **pbuf,
		int *restore_flags)
{
	unsigned long pgsize = getpagesize();
	long flags;

	if (ioctl(fd, BLKBSZGET, blksz) < 0) {
		pp_pl_log(3, "cannot get blocksize, set default");
		*blksz = DEF_BLK_SIZE;
	}
	/* Page-aligned buffer for the O_DIRECT reads */
	if (posix_memalign((void **)pbuf, pgsize, *blksz)) {
		*pbuf = NULL;
		return -1;
	}

	flags = fcntl(fd, F_GETFL);
	if (flags < 0)
		goto free_out;
	if (!(flags & O_DIRECT)) {
		flags |= O_DIRECT;
		if (fcntl(fd, F_SETFL, flags) < 0)
			goto free_out;
		*restore_flags = 1;
	}

	return 0;

free_out:
	free(*pbuf);
	/* Do not hand a dangling pointer back to the caller */
	*pbuf = NULL;

	return -1;
}

/*
 * Undo prepare_directio_read(): release the read buffer and, if
 * @restore_flags is non-zero, clear O_DIRECT on @fd again.
 */
static void cleanup_directio_read(int fd, char *buf, int restore_flags)
{
	long cur_flags;
	int rc __attribute__ ((unused));

	free(buf);

	if (restore_flags) {
		cur_flags = fcntl(fd, F_GETFL);
		if (cur_flags < 0)
			return;
		/* Best effort only: a failure to reset the flag is ignored */
		rc = fcntl(fd, F_SETFL, cur_flags & ~O_DIRECT);
	}
}

/*
 * Read @sz bytes from the start of @fd into @buf.
 *
 * The file offset is reset to 0, the descriptor is waited on for at most
 * @timeout seconds, and then a single read() is issued.
 *
 * poll() is used for the wait rather than select(): FD_SET() is undefined
 * for descriptors >= FD_SETSIZE, and a process holding many descriptors
 * can get there.
 *
 * Returns 0 if exactly @sz bytes were read, -1 otherwise (seek failure,
 * timeout, poll error, read error or short read).
 */
static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int timeout_ms;
	ssize_t num_read;

	/* A negative size could otherwise compare equal to a failed read() */
	if (sz < 0)
		return -1;
	if (lseek(fd, 0, SEEK_SET) == -1)
		return -1;

	/* poll() takes milliseconds in an int: clamp instead of overflowing */
	if (timeout > INT_MAX / 1000)
		timeout_ms = INT_MAX;
	else
		timeout_ms = (int)(timeout * 1000);

	/*
	 * Any reported event (including error/hang-up) lets the read() below
	 * decide, which matches what select() on the read set did.
	 */
	if (poll(&pfd, 1, timeout_ms) <= 0)
		return -1;

	num_read = read(fd, buf, sz);
	if (num_read != sz)
		return -1;

	return 0;
}

/*
 * Validate the prioritizer arguments.
 *
 * @io_num must lie in [MIN_IO_NUM, MAX_IO_NUM] and @base_num in
 * [MIN_BASE_NUM, MAX_BASE_NUM]. A message is logged for the first
 * argument found to be out of range.
 *
 * Returns 1 if both arguments are valid, 0 otherwise.
 */
int check_args_valid(int io_num, double base_num)
{
	if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) {
		pp_pl_log(0, "args io_num is outside the valid range");
		return 0;
	}

	/*
	 * Written as a negated "inside the range" test so that NaN, for
	 * which every ordered comparison is false, is rejected as well.
	 */
	if (!((base_num >= MIN_BASE_NUM) && (base_num <= MAX_BASE_NUM))) {
		pp_pl_log(0, "args base_num is outside the valid range");
		return 0;
	}

	return 1;
}

/*
 * In multipath.conf, args form: io_num=n base_num=m. For example, args are
 * "io_num=20 base_num=10", this function can get io_num value 20 and
 * base_num value 10.
 */
static int get_ionum_and_basenum(char *args, int *ionum, double *basenum)
{
	char split_char[] = " \t";
	char *arg, *temp;
	char *str, *str_inval;
	int i;
	int flag_io = 0, flag_base = 0;

	if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) {
		pp_pl_log(0, "args string is NULL");
		return 0;
	}

	arg = temp = STRDUP(args);
	if (!arg)
		return 0;

	for (i = 0; i < 2; i++) {
		str = get_next_string(&temp, split_char);
		if (!str)
			goto out;
		if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) {
			*ionum = (int)strtoul(str + 7, &str_inval, 10);
			if (str == str_inval)
				goto out;
			flag_io = 1;
		}
		else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) {
			*basenum = strtod(str + 9, &str_inval);
			if (str == str_inval)
				goto out;
			flag_base = 1;
		}
	}

	if (!flag_io || !flag_base)
		goto out;
	if (check_args_valid(*ionum, *basenum) == 0)
		goto out;

	FREE(arg);
	return 1;
out:
	FREE(arg);
	return 0;
}

/*
 * Turn a log-scaled average latency into a priority: the distance of the
 * latency from the maximum, truncated to an integer. Latencies at or below
 * the minimum get the full span, latencies at or above the maximum get 0.
 *
 * The priority is not scaled into a fixed range such as [0, 1024],
 * because scaling would eliminate the effect of base_num.
 */
int calcPrio(double lg_avglatency, double lg_maxavglatency,
		double lg_minavglatency)
{
	double prio;

	if (lg_avglatency <= lg_minavglatency)
		prio = lg_maxavglatency - lg_minavglatency;
	else if (lg_avglatency >= lg_maxavglatency)
		prio = 0;
	else
		prio = lg_maxavglatency - lg_avglatency;

	return (int)prio;
}

int getprio(struct path *pp, char *args, unsigned int timeout)
{
	int rc, temp;
	int io_num = 0;
	double base_num = 0;
	double lg_avglatency, lg_maxavglatency, lg_minavglatency;
	double standard_deviation;
	double lg_toldelay = 0;
	int blksize;
	char *buf;
	int restore_flags = 0;
	double lg_base;
	double sum_squares = 0;

	if (pp->fd < 0)
		return -1;

	if (get_ionum_and_basenum(args, &io_num, &base_num) == 0) {
		io_num = DEF_IO_NUM;
		base_num = DEF_BASE_NUM;
		pp_pl_log(0, "%s: fails to get path_latency args, set default:"
				"io_num=%d base_num=%.3lf",
				pp->dev, io_num, base_num);
	}

	lg_base = log(base_num);
	lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base;
	lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base;

	if (prepare_directio_read(pp->fd, &blksize, &buf, &restore_flags) < 0)
		return PRIO_UNDEF;

	temp = io_num;
	while (temp-- > 0) {
		struct timespec tv_before, tv_after, tv_diff;
		double diff, reldiff;

		(void)clock_gettime(CLOCK_MONOTONIC, &tv_before);

		if (do_directio_read(pp->fd, timeout, buf, blksize)) {
			pp_pl_log(0, "%s: path down", pp->dev);
			cleanup_directio_read(pp->fd, buf, restore_flags);
			return -1;
		}

		(void)clock_gettime(CLOCK_MONOTONIC, &tv_after);

		timespecsub(&tv_after, &tv_before, &tv_diff);
		diff = tv_diff.tv_sec * 1000 * 1000 + tv_diff.tv_nsec / 1000;

		if (diff == 0)
			/*
			 * Avoid taking log(0).
			 * This unlikely case is treated as minimum -
			 * the sums don't increase
			 */
			continue;

		/* we scale by lg_base here */
		reldiff = log(diff) / lg_base;

		/*
		 * We assume that the latency complies with Log-normal
		 * distribution. The logarithm of latency is in normal
		 * distribution.
		 */
		lg_toldelay += reldiff;
		sum_squares += reldiff * reldiff;
	}

	cleanup_directio_read(pp->fd, buf, restore_flags);

	lg_avglatency = lg_toldelay / (long long)io_num;

	if (lg_avglatency > lg_maxavglatency) {
		pp_pl_log(2,
			  "%s: average latency (%lld us) is outside the thresold (%lld us)",
			  pp->dev, (long long)pow(base_num, lg_avglatency),
			  (long long)MAX_AVG_LATENCY);
		return DEFAULT_PRIORITY;
	}

	standard_deviation = sqrt((sum_squares - lg_toldelay * lg_avglatency)
				  / (io_num - 1));

	rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency);

	pp_pl_log(3, "%s: latency avg=%.2e uncertainty=%.1f prio=%d\n",
		  pp->dev, exp(lg_avglatency * lg_base),
		  exp(standard_deviation * lg_base), rc);

	return rc;
}