/*
 * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
 *
 * path_latency.c
 *
 * Prioritizer for device mapper multipath, where the corresponding priority
 * values of specific paths are provided by a latency algorithm. And the
 * latency algorithm is dependent on arguments("io_num" and "base_num").
 *
 * The principle of the algorithm as follows:
 * 1. By sending a certain number "io_num" of read IOs to the current path
 *    continuously, the IOs' average latency can be calculated.
 * 2. Max value and min value of average latency are constant. According to
 *    the average latency of each path and the "base_num" of logarithmic
 *    scale, the priority "rc" of each path can be provided.
 *
 * Author(s): Yang Feng
 * Revised: Guan Junxiong
 *
 * This file is released under the GPL version 2, or any later version.
 */

#define _GNU_SOURCE
/*
 * NOTE(review): the header names of the eight original system #include
 * directives were not readable in the reviewed copy of this file.  The list
 * below covers every libc/kernel identifier this file uses (plus <errno.h>
 * and <limits.h> for the argument parser) -- confirm against the original
 * and restore any header that is missing here.
 */
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/select.h>
#include <linux/fs.h>

#include "debug.h"
#include "prio.h"
#include "structs.h"
#include "util.h"
#include "time-util.h"

#define pp_pl_log(prio, fmt, args...) \
	condlog(prio, "path_latency prio: " fmt, ##args)

#define MAX_IO_NUM		200
#define MIN_IO_NUM		20
#define DEF_IO_NUM		100

#define MAX_BASE_NUM		10
#define MIN_BASE_NUM		1.1
// This is 10**(1/4). 4 prio steps correspond to a factor of 10.
#define DEF_BASE_NUM		1.77827941004

#define MAX_AVG_LATENCY		100000000.	/* Unit: us */
#define MIN_AVG_LATENCY		1.
/* Unit: us */ #define DEFAULT_PRIORITY 0 #define USEC_PER_SEC 1000000LL #define NSEC_PER_USEC 1000LL #define DEF_BLK_SIZE 4096 static int prepare_directio_read(int fd, int *blksz, char **pbuf, int *restore_flags) { unsigned long pgsize = getpagesize(); long flags; if (ioctl(fd, BLKBSZGET, blksz) < 0) { pp_pl_log(3,"catnnot get blocksize, set default"); *blksz = DEF_BLK_SIZE; } if (posix_memalign((void **)pbuf, pgsize, *blksz)) return -1; flags = fcntl(fd, F_GETFL); if (flags < 0) goto free_out; if (!(flags & O_DIRECT)) { flags |= O_DIRECT; if (fcntl(fd, F_SETFL, flags) < 0) goto free_out; *restore_flags = 1; } return 0; free_out: free(*pbuf); return -1; } static void cleanup_directio_read(int fd, char *buf, int restore_flags) { long flags; free(buf); if (!restore_flags) return; if ((flags = fcntl(fd, F_GETFL)) >= 0) { int ret __attribute__ ((unused)); flags &= ~O_DIRECT; /* No point in checking for errors */ ret = fcntl(fd, F_SETFL, flags); } } static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz) { fd_set read_fds; struct timeval tm = { .tv_sec = timeout }; int ret; int num_read; if (lseek(fd, 0, SEEK_SET) == -1) return -1; FD_ZERO(&read_fds); FD_SET(fd, &read_fds); ret = select(fd+1, &read_fds, NULL, NULL, &tm); if (ret <= 0) return -1; num_read = read(fd, buf, sz); if (num_read != sz) return -1; return 0; } int check_args_valid(int io_num, double base_num) { if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) { pp_pl_log(0, "args io_num is outside the valid range"); return 0; } if ((base_num < MIN_BASE_NUM) || (base_num > MAX_BASE_NUM)) { pp_pl_log(0, "args base_num is outside the valid range"); return 0; } return 1; } /* * In multipath.conf, args form: io_num=n base_num=m. For example, args are * "io_num=20 base_num=10", this function can get io_num value 20 and * base_num value 10. 
*/ static int get_ionum_and_basenum(char *args, int *ionum, double *basenum) { char split_char[] = " \t"; char *arg, *temp; char *str, *str_inval; int i; int flag_io = 0, flag_base = 0; if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) { pp_pl_log(0, "args string is NULL"); return 0; } arg = temp = STRDUP(args); if (!arg) return 0; for (i = 0; i < 2; i++) { str = get_next_string(&temp, split_char); if (!str) goto out; if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) { *ionum = (int)strtoul(str + 7, &str_inval, 10); if (str == str_inval) goto out; flag_io = 1; } else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) { *basenum = strtod(str + 9, &str_inval); if (str == str_inval) goto out; flag_base = 1; } } if (!flag_io || !flag_base) goto out; if (check_args_valid(*ionum, *basenum) == 0) goto out; FREE(arg); return 1; out: FREE(arg); return 0; } /* * Do not scale the prioriy in a certain range such as [0, 1024] * because scaling will eliminate the effect of base_num. */ int calcPrio(double lg_avglatency, double lg_maxavglatency, double lg_minavglatency) { if (lg_avglatency <= lg_minavglatency) return lg_maxavglatency - lg_minavglatency; if (lg_avglatency >= lg_maxavglatency) return 0; return lg_maxavglatency - lg_avglatency; } int getprio(struct path *pp, char *args, unsigned int timeout) { int rc, temp; int io_num = 0; double base_num = 0; double lg_avglatency, lg_maxavglatency, lg_minavglatency; double standard_deviation; double lg_toldelay = 0; int blksize; char *buf; int restore_flags = 0; double lg_base; double sum_squares = 0; if (pp->fd < 0) return -1; if (get_ionum_and_basenum(args, &io_num, &base_num) == 0) { io_num = DEF_IO_NUM; base_num = DEF_BASE_NUM; pp_pl_log(0, "%s: fails to get path_latency args, set default:" "io_num=%d base_num=%.3lf", pp->dev, io_num, base_num); } lg_base = log(base_num); lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base; lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base; if (prepare_directio_read(pp->fd, 
&blksize, &buf, &restore_flags) < 0) return PRIO_UNDEF; temp = io_num; while (temp-- > 0) { struct timespec tv_before, tv_after, tv_diff; double diff, reldiff; (void)clock_gettime(CLOCK_MONOTONIC, &tv_before); if (do_directio_read(pp->fd, timeout, buf, blksize)) { pp_pl_log(0, "%s: path down", pp->dev); cleanup_directio_read(pp->fd, buf, restore_flags); return -1; } (void)clock_gettime(CLOCK_MONOTONIC, &tv_after); timespecsub(&tv_after, &tv_before, &tv_diff); diff = tv_diff.tv_sec * 1000 * 1000 + tv_diff.tv_nsec / 1000; if (diff == 0) /* * Avoid taking log(0). * This unlikely case is treated as minimum - * the sums don't increase */ continue; /* we scale by lg_base here */ reldiff = log(diff) / lg_base; /* * We assume that the latency complies with Log-normal * distribution. The logarithm of latency is in normal * distribution. */ lg_toldelay += reldiff; sum_squares += reldiff * reldiff; } cleanup_directio_read(pp->fd, buf, restore_flags); lg_avglatency = lg_toldelay / (long long)io_num; if (lg_avglatency > lg_maxavglatency) { pp_pl_log(2, "%s: average latency (%lld us) is outside the thresold (%lld us)", pp->dev, (long long)pow(base_num, lg_avglatency), (long long)MAX_AVG_LATENCY); return DEFAULT_PRIORITY; } standard_deviation = sqrt((sum_squares - lg_toldelay * lg_avglatency) / (io_num - 1)); rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency); pp_pl_log(3, "%s: latency avg=%.2e uncertainty=%.1f prio=%d\n", pp->dev, exp(lg_avglatency * lg_base), exp(standard_deviation * lg_base), rc); return rc; }