Blob Blame History Raw
/*
 * Copyright (c) 2013, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the names of its contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define _GNU_SOURCE
#include <inttypes.h>
#include <stdlib.h>
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <math.h>
#include <sys/wait.h>
#include "../include/types.h"
#include "../include/util.h"
#include "../include/os/os_util.h"

/*
 * Clock calibration values. They are only defined in this file; nothing
 * here reads or writes them.
 * NOTE(review): presumably "clock cycles per second" and "nanoseconds per
 * clock cycle" as produced by os_calibrate() -- confirm against the callers.
 */
uint64_t g_clkofsec;
double g_nsofclk;
/*
 * Running counter used by os_sysfs_cmt_task_set() to invent a monitoring
 * group id when neither a pid nor a lwpid is given. It is reset to 0 by
 * os_cmt_init() and os_cmt_fini().
 */
unsigned int g_pqos_moni_id;

/*
 * Report whether the caller may run the tool.
 * No check is performed in this implementation: the answer is always B_TRUE.
 */
boolean_t
os_authorized(void)
{
	boolean_t authorized = B_TRUE;

	return (authorized);
}

/*
 * Instance-lock hook. Nothing is locked in this implementation: 'locked'
 * is never touched and the function always returns 0.
 */
int
os_numatop_lock(boolean_t *locked __attribute__((unused)))
{
	const int rc = 0;	/* Not supported on Linux */

	return (rc);
}

/*
 * Counterpart of os_numatop_lock(); does nothing in this implementation.
 */
void
os_numatop_unlock(void)
{
	/* Not supported on Linux */
	return;
}

/*
 * psinfo lookup hook. Both arguments are ignored in this implementation
 * and the function always returns 0.
 */
int
os_procfs_psinfo_get(pid_t pid __attribute__((unused)),
	void *info __attribute__((unused)))
{
	int status = 0;	/* Not supported on Linux */

	return (status);
}

/*
 * Retrieve the process's executable name from '/proc/<pid>/comm'.
 *
 * pid:  process whose name is wanted.
 * buf:  destination; on success it holds the NUL-terminated name with the
 *       trailing newline removed.
 * size: capacity of 'buf' in bytes.
 *
 * Returns 0 on success. Returns -1 if 'buf' is NULL, 'size' is not
 * positive, or the file cannot be opened or yields no data.
 */
int
os_procfs_pname_get(pid_t pid, char *buf, int size)
{
	char pname[PATH_MAX];
	int procfd; /* file descriptor for /proc/nnnnn/comm */
	int len;

	if ((buf == NULL) || (size <= 0)) {
		return (-1);
	}

	(void) snprintf(pname, sizeof (pname), "/proc/%d/comm", (int)pid);
	if ((procfd = open(pname, O_RDONLY)) < 0) {
		return (-1);
	}

	len = read(procfd, buf, size);
	(void) close(procfd);

	/*
	 * A zero-length read must be rejected too: 'buf[len - 1]' would
	 * otherwise write in front of the buffer.
	 */
	if (len <= 0) {
		return (-1);
	}

	if ((buf[len - 1] == '\n') || (len == size)) {
		/* Drop the newline, or truncate when the buffer is full. */
		buf[len - 1] = 0;
	} else {
		buf[len] = 0;
	}

	return (0);
}

/*
 * Enumerate the ids found under '/proc/<pid>/task'.
 *
 * The directory name is built here and handed to procfs_enum_id(); 'lwps',
 * 'num' and the return value are whatever that helper produces.
 */
int
os_procfs_lwp_enum(pid_t pid, int **lwps, int *num)
{
	char task_dir[PATH_MAX];
	int rc;

	(void) snprintf(task_dir, sizeof (task_dir), "/proc/%d/task", pid);
	rc = procfs_enum_id(task_dir, lwps, num);

	return (rc);
}

/*
 * Hook for checking that the given pid/lwpid can be found in '/proc'.
 * No lookup is performed in this implementation: both arguments are
 * ignored and the answer is always B_TRUE.
 */
boolean_t
os_procfs_lwp_valid(pid_t pid __attribute__((unused)),
	int lwpid __attribute__((unused)))
{
	boolean_t valid = B_TRUE;	/* Not supported on Linux */

	return (valid);
}

/*
 * Restrict the current thread to the single cpu 'cpu'.
 *
 * Returns 0 on success, or -1 (after a debug message) when
 * sched_setaffinity() refuses the request.
 */
int
processor_bind(int cpu)
{
	cpu_set_t mask;
	int rc;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);

	rc = sched_setaffinity(0, sizeof (mask), &mask);
	if (rc >= 0) {
		return (0);
	}

	debug_print(NULL, 2, "Fail to bind to CPU%d\n", cpu);
	return (-1);
}

/*
 * Undo processor_bind(): allow the current thread to run on every cpu
 * numbered 0 .. g_ncpus - 1 again.
 *
 * Returns 0 on success, or -1 (after a debug message) when
 * sched_setaffinity() refuses the request.
 */
int
processor_unbind(void)
{
	cpu_set_t all_cpus;
	int cpu = 0;

	CPU_ZERO(&all_cpus);
	while (cpu < g_ncpus) {
		CPU_SET(cpu, &all_cpus);
		cpu++;
	}

	if (sched_setaffinity(0, sizeof (all_cpus), &all_cpus) >= 0) {
		return (0);
	}

	debug_print(NULL, 2, "Fail to unbind from CPU\n");
	return (-1);
}

/*
 * Derive the clock calibration from the frequency reported by
 * arch__cpuinfo_freq().
 *
 * nsofclk:  out, NS_SEC divided by the clock count stored in '*clkofsec'.
 * clkofsec: out, the reported frequency truncated to an integer.
 *
 * Returns 0 on success. Returns -1, leaving both outputs untouched, if the
 * frequency cannot be obtained or is unusable.
 */
static int
calibrate_cpuinfo(double *nsofclk, uint64_t *clkofsec)
{
	char unit[11] = {0};
	double freq = 0.0;

	if (arch__cpuinfo_freq(&freq, unit)) {
		return (-1);
	}

	/*
	 * Anything below 1.0 (including negative values and NaN) cannot be
	 * stored in the unsigned clock count: it would truncate to 0 or
	 * convert out of range, yet be reported as a success and so prevent
	 * the other calibration methods from being tried.
	 */
	if (!(freq >= 1.0)) {
		return (-1);
	}

	*clkofsec = freq;
	*nsofclk = (double)NS_SEC / *clkofsec;

	debug_print(NULL, 2, "calibrate_cpuinfo: nsofclk = %.4f, "
	    "clkofsec = %" PRIu64 "\n", *nsofclk, *clkofsec);

	return (0);
}

/*
 * On all recent Intel CPUs, the TSC frequency is always
 * the highest p-state. So get that frequency from sysfs.
 * e.g. 2262000
 *
 * The number read from CPU0_CPUFREQ_PATH is multiplied by KHZ to give
 * '*clkofsec'; '*nsofclk' is NS_SEC divided by that result.
 *
 * Returns 0 on success. Returns -1, leaving both outputs untouched, if the
 * file cannot be read or does not start with a positive number.
 */
static int
calibrate_cpufreq(double *nsofclk, uint64_t *clkofsec)
{
	int fd;
	ssize_t len;
	char buf[32];
	char *end;
	long long freq;

	if ((fd = open(CPU0_CPUFREQ_PATH, O_RDONLY)) < 0) {
		return (-1);
	}

	/* Keep one byte free for the terminator added below. */
	len = read(fd, buf, sizeof (buf) - 1);
	(void) close(fd);
	if (len <= 0) {
		return (-1);
	}

	buf[len] = 0;

	/*
	 * Parse as signed so that a negative or out-of-range value is
	 * rejected instead of wrapping into a huge unsigned frequency.
	 */
	errno = 0;
	freq = strtoll(buf, &end, 10);
	if ((end == buf) || (errno == ERANGE) || (freq <= 0)) {
		return (-1);
	}

	*clkofsec = (uint64_t)freq * KHZ;
	*nsofclk = (double)NS_SEC / *clkofsec;

	debug_print(NULL, 2, "calibrate_cpufreq: nsofclk = %.4f, "
	    "clkofsec = %" PRIu64 "\n", *nsofclk, *clkofsec);

	return (0);
}

/*
 * Measure how many TSC cycles in a second and how many
 * nanoseconds in a TSC cycle.
 *
 * The thread is bound to the first cpu that accepts the binding, the TSC
 * is sampled across an interval of at least 100 ms (as reported by
 * current_ms()), and the thread is unbound again afterwards.
 *
 * nsofclk:  out, elapsed nanoseconds divided by elapsed TSC cycles.
 * clkofsec: out, NS_SEC divided by '*nsofclk'.
 *
 * If no cpu can be bound, the function returns without touching the
 * outputs.
 */
static void
calibrate_by_tsc(double *nsofclk, uint64_t *clkofsec)
{
	uint64_t start_ms, end_ms, diff_ms;
	uint64_t start_tsc, end_tsc;
	int i;

	for (i = 0; i < g_ncpus; i++) {
		/*
		 * Bind current thread to cpuN to ensure the
		 * thread can not be migrated to another cpu
		 * while the rdtsc runs.
		 */
		if (processor_bind(i) == 0) {
			break;
		}
	}

	if (i == g_ncpus) {
		return;
	}

	/*
	 * Make sure the start_ms is at the beginning of
	 * one millisecond.
	 */
	end_ms = current_ms(&g_tvbase);
	while ((start_ms = current_ms(&g_tvbase)) == end_ms) {}

	start_tsc = rdtsc();
	while ((end_ms = current_ms(&g_tvbase)) < (start_ms + 100)) {}
	end_tsc = rdtsc();

	diff_ms = end_ms - start_ms;
	*nsofclk = (double)(diff_ms * NS_MS) /
	    (double)(end_tsc - start_tsc);

	*clkofsec = (uint64_t)((double)NS_SEC / *nsofclk);

	/*
	 * Unbind current thread from cpu once the measurement completed.
	 */
	processor_unbind();

	/* PRIu64 keeps the format correct wherever uint64_t is not 'long'. */
	debug_print(NULL, 2, "calibrate_by_tsc: nsofclk = %.4f, "
	    "clkofsec = %" PRIu64 "\n", *nsofclk, *clkofsec);
}

/*
 * Work out the clock calibration, trying three sources in order and
 * stopping at the first one that succeeds:
 *
 *   1. calibrate_cpuinfo()
 *   2. calibrate_cpufreq()
 *   3. calibrate_by_tsc()
 *
 * calibrate_by_tsc() is the last resort, used only when the cpu frequency
 * is available from neither procfs nor sysfs.
 *
 * On Intel, calibrate_by_tsc() uses the TSC register, which is updated in
 * sync with the processor clock, so the cpu frequency can be calculated
 * programmatically from it.
 *
 * PowerPC, on the other hand, has no analogue of the TSC. There is a
 * register called TB (Time Base), but it is updated at a constant
 * frequency, so the cpu frequency cannot be derived from it. On PowerPC
 * the cpu frequency is always exposed through either procfs or sysfs, so
 * there is no need to depend on any other method there.
 */
void
os_calibrate(double *nsofclk, uint64_t *clkofsec)
{
	if ((calibrate_cpuinfo(nsofclk, clkofsec) != 0) &&
	    (calibrate_cpufreq(nsofclk, clkofsec) != 0)) {
		calibrate_by_tsc(nsofclk, clkofsec);
	}
}

/*
 * Parse the leading decimal integer of 'str' into '*digit'.
 *
 * Characters after the number are ignored. Returns B_TRUE on success.
 * Returns B_FALSE, leaving '*digit' untouched, if 'str' does not start
 * with a number or the number does not fit in an int.
 */
static boolean_t
int_get(char *str, int *digit)
{
	char *end;
	long val;

	/* Distinguish success/failure after strtol */
	errno = 0;
	val = strtol(str, &end, 10);
	if (errno != 0) {
		return (B_FALSE);
	}

	if (end == str) {
		return (B_FALSE);
	}

	/* 'long' may be wider than 'int': refuse instead of truncating. */
	if ((val < INT_MIN) || (val > INT_MAX)) {
		return (B_FALSE);
	}

	*digit = (int)val;
	return (B_TRUE);
}

/*
 * Split a "<start>-<end>" string into its two numbers; for example "0-9"
 * yields start = 0 and end = 9. Only meant for short strings: anything of
 * DIGIT_LEN_MAX characters or more is rejected.
 *
 * Returns B_TRUE when both numbers were parsed. On failure '*start' may
 * already have been written.
 */
static boolean_t
hyphen_int_extract(char *str, int *start, int *end)
{
	char token[DIGIT_LEN_MAX];
	boolean_t ok = B_FALSE;

	if (strlen(str) < DIGIT_LEN_MAX) {
		/* Text before the first '-', then the word right after it. */
		ok = ((sscanf(str, "%511[^-]", token) > 0) &&
		    int_get(token, start) &&
		    (sscanf(str, "%*[^-]-%511s", token) > 0) &&
		    int_get(token, end)) ? B_TRUE : B_FALSE;
	}

	return (ok);
}

/*
 * Store the 'num' consecutive values value, value + 1, ... into 'arr'
 * starting at position 'index'.
 *
 * arr_size is the capacity of 'arr' in elements. Returns B_FALSE without
 * writing anything if 'index' or 'num' is negative or the run would not
 * fit; B_TRUE otherwise (a 'num' of 0 is a successful no-op).
 */
static boolean_t
arrary_add(int *arr, int arr_size, int index, int value, int num)
{
	int i;

	/*
	 * Negative values must be refused explicitly: a negative 'num'
	 * (from a reversed range such as "5-1") would otherwise pass the
	 * capacity test, and a negative 'index' would write in front of
	 * the array.
	 */
	if ((index < 0) || (num < 0)) {
		return (B_FALSE);
	}

	/* Written as a subtraction so that the test cannot overflow. */
	if ((index >= arr_size) || (num > (arr_size - index))) {
		return (B_FALSE);
	}

	for (i = 0; i < num; i++) {
		arr[index + i] = value + i;
	}

	return (B_TRUE);
}

/*
 * Extract the digits from string. For example:
 * "1-2,5-7"	return 1 2 5 6 7 in "arr".
 *
 * str:      comma-separated list of numbers and "<start>-<end>" ranges;
 *           it is not modified (a private copy is split up).
 * arr:      receives the expanded numbers.
 * arr_size: capacity of 'arr' in elements.
 * num:      out, how many numbers were stored; written only on success.
 *
 * Items that cannot be parsed are skipped. Returns B_FALSE on allocation
 * failure, when a range is reversed (end < start) or when the numbers do
 * not fit into 'arr'.
 */
static boolean_t
str_int_extract(char *str, int *arr, int arr_size, int *num)
{
	char *p, *cur, *scopy;
	int start, end, total = 0;
	long long span;
	size_t len = strlen(str);
	boolean_t ret = B_FALSE;

	if ((scopy = malloc(len + 1)) == NULL) {
		return (B_FALSE);
	}

	/* Copies the terminator as well. */
	memcpy(scopy, str, len + 1);
	cur = scopy;

	while (cur < (scopy + len)) {
		if ((p = strchr(cur, ',')) != NULL) {
			*p = 0;
		}

		if (strchr(cur, '-') != NULL) {
			if (hyphen_int_extract(cur, &start, &end)) {
				/*
				 * A reversed range would give a negative
				 * count, pull 'total' below zero and turn it
				 * into a negative array index for the next
				 * item.
				 */
				if (end < start) {
					goto L_EXIT;
				}

				/* Computed in a wider type to avoid overflow. */
				span = (long long)end - start + 1;
				if (span > (long long)arr_size - total) {
					goto L_EXIT;
				}

				if (!arrary_add(arr, arr_size, total, start,
				    (int)span)) {
					goto L_EXIT;
				}

				total += (int)span;
			}
		} else {
			if (int_get(cur, &start)) {
				if (!arrary_add(arr, arr_size, total, start, 1)) {
					goto L_EXIT;
				}

				total++;
			}
		}

		if (p == NULL) {
			break;
		}

		cur = p + 1;
	}

	*num = total;
	ret = B_TRUE;

L_EXIT:
	free(scopy);
	return (ret);
}

/*
 * Read the first line of the file 'path' and expand the number list on it
 * into 'arr' by means of str_int_extract().
 *
 * Returns B_FALSE if the file cannot be opened or is empty; otherwise the
 * result of str_int_extract(). At most LINE_SIZE - 1 characters of the
 * line are considered.
 */
static boolean_t
file_int_extract(char *path, int *arr, int arr_size, int *num)
{
	char line[LINE_SIZE];
	char *got;
	FILE *stream = fopen(path, "r");

	if (stream == NULL) {
		return (B_FALSE);
	}

	got = fgets(line, LINE_SIZE, stream);
	fclose(stream);

	if (got == NULL) {
		return (B_FALSE);
	}

	return (str_int_extract(line, arr, arr_size, num));
}

/*
 * Fill 'node_arr' (capacity 'arr_size') with the ids listed in the file
 * NODE_NONLINE_PATH and store their count in '*num'.
 * Returns the result of file_int_extract().
 */
boolean_t
os_sysfs_node_enum(int *node_arr, int arr_size, int *num)
{
	boolean_t found;

	found = file_int_extract(NODE_NONLINE_PATH, node_arr, arr_size, num);
	return (found);
}

/*
 * Fill 'cpu_arr' (capacity 'arr_size') with the ids listed in
 * '<NODE_INFO_ROOT>/node<nid>/cpulist' and store their count in '*num'.
 * Returns the result of file_int_extract().
 */
boolean_t
os_sysfs_cpu_enum(int nid, int *cpu_arr, int arr_size, int *num)
{
	char cpulist[PATH_MAX];
	boolean_t found;

	snprintf(cpulist, PATH_MAX, "%s/node%d/cpulist", NODE_INFO_ROOT, nid);
	found = file_int_extract(cpulist, cpu_arr, arr_size, num);

	return (found);
}

/*
 * Count the cpu ids listed in '/sys/devices/system/cpu/online'.
 *
 * Returns that count, or -1 if sysconf(_SC_NPROCESSORS_CONF) reports more
 * than NCPUS_MAX processors or the file cannot be read and parsed.
 */
int
os_sysfs_online_ncpus(void)
{
	int online[NCPUS_MAX];
	int count;
	char path[PATH_MAX];

	if (sysconf(_SC_NPROCESSORS_CONF) > NCPUS_MAX) {
		return (-1);
	}

	snprintf(path, PATH_MAX, "/sys/devices/system/cpu/online");

	return (file_int_extract(path, online, NCPUS_MAX, &count) ?
	    count : -1);
}

/*
 * Parse the size that follows the first ':' in a "<name>: <number> ..."
 * line and store it, multiplied by KB_BYTES, in '*size'.
 * NOTE(review): presumably the number is in kB and the result in bytes --
 * confirm against the definition of KB_BYTES.
 *
 * Any non-digit characters between the colon and the number are skipped;
 * the number may also follow the colon directly.
 *
 * Returns B_TRUE on success. Returns B_FALSE, leaving '*size' untouched,
 * if there is no colon, no number after it, or the result would not fit
 * in 64 bits.
 */
static boolean_t
memsize_parse(char *str, uint64_t *size)
{
	char *p, *end;
	unsigned long long val;

	if ((p = strchr(str, ':')) == NULL) {
		return (B_FALSE);
	}

	/* Move to the first digit after the colon, if any. */
	p++;
	p += strcspn(p, "0123456789");
	if (*p == '\0') {
		return (B_FALSE);
	}

	errno = 0;
	val = strtoull(p, &end, 10);
	if ((end == p) || (errno == ERANGE) ||
	    (val > (UINT64_MAX / KB_BYTES))) {
		return (B_FALSE);
	}

	*size = (uint64_t)val * KB_BYTES;
	return (B_TRUE);
}

/*
 * Fill 'info' from '<NODE_INFO_ROOT>/node<nid>/meminfo'.
 *
 * 'info' is zeroed first. Each line is then compared, in the order of the
 * table below, against a set of keys; the first key found anywhere in the
 * line decides which field receives the value parsed by memsize_parse().
 * Reading stops at the end of the file or once as many values have been
 * stored as node_meminfo_t has uint64_t-sized slots.
 *
 * Returns B_TRUE on success, B_FALSE if the file cannot be opened or a
 * matching line cannot be parsed (fields stored before that stay set).
 */
boolean_t
os_sysfs_meminfo(int nid, node_meminfo_t *info)
{
	struct {
		const char *key;
		uint64_t *field;
	} table[] = {
		{ "MemTotal:",	&info->mem_total },
		{ "MemFree:",	&info->mem_free },
		{ "Active:",	&info->active },
		{ "Inactive:",	&info->inactive },
		{ "Dirty:",	&info->dirty },
		{ "Writeback:",	&info->writeback },
		{ "Mapped:",	&info->mapped }
	};
	const int nkeys = sizeof (table) / sizeof (table[0]);
	const int nslots = sizeof (node_meminfo_t) / sizeof (uint64_t);
	char path[PATH_MAX];
	char *line = NULL;
	size_t cap = 0;
	FILE *fp;
	int stored = 0, k;
	boolean_t ret = B_FALSE;

	memset(info, 0, sizeof (node_meminfo_t));
	snprintf(path, PATH_MAX, "%s/node%d/meminfo", NODE_INFO_ROOT, nid);
	if ((fp = fopen(path, "r")) == NULL) {
		return (B_FALSE);
	}

	while ((getline(&line, &cap, fp) > 0) && (stored < nslots)) {
		for (k = 0; k < nkeys; k++) {
			if (strstr(line, table[k].key) == NULL) {
				continue;
			}

			if (!memsize_parse(line, table[k].field)) {
				goto L_EXIT;
			}

			stored++;
			break;
		}
	}

	ret = B_TRUE;

L_EXIT:
	free(line);
	fclose(fp);
	return (ret);
}

/*
 * Read a floating-point scale factor from the first line of 'path'.
 *
 * '*scale' is set to 0.0 up front, so that is what the caller sees when
 * the function fails. Returns 0 on success, -1 if the file cannot be
 * opened or is empty.
 */
int
os_sysfs_cqm_llc_scale(const char *path, double *scale)
{
	char line[LINE_SIZE];
	char *got;
	FILE *stream;

	*scale = 0.0;

	stream = fopen(path, "r");
	if (stream == NULL) {
		return (-1);
	}

	got = fgets(line, LINE_SIZE, stream);
	fclose(stream);

	if (got == NULL) {
		return (-1);
	}

	*scale = strtod(line, NULL);
	return (0);
}

/*
 * Read the number stored in '/sys/devices/uncore_<unit>_<idx>/type'.
 *
 * Returns 0 and stores the parsed value in '*type', or -1 if the file
 * cannot be opened or yields no data.
 */
static int
uncore_type_read(const char *unit, int idx, int *type)
{
	char path[PATH_MAX], buf[32];
	ssize_t len;
	int fd;

	snprintf(path, PATH_MAX, "/sys/devices/uncore_%s_%d/type", unit, idx);
	if ((fd = open(path, O_RDONLY)) < 0)
		return -1;

	/*
	 * read() does not terminate the buffer, so keep one byte free and
	 * add the terminator before parsing. An empty read leaves nothing
	 * to parse and is treated as a failure.
	 */
	len = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (len <= 0)
		return -1;

	buf[len] = 0;
	*type = (int)strtol(buf, NULL, 10);
	return 0;
}

/*
 * Common part of the QPI and UPI initialisation: fill qpi[0 .. num - 1]
 * for the units 'uncore_<unit>_<i>', stopping at the first unit whose type
 * cannot be read. Returns the number of entries filled in.
 */
static int
uncore_link_init(qpi_info_t *qpi, int num, const char *unit, int config)
{
	int i, type;

	for (i = 0; i < num; i++)
	{
		if (uncore_type_read(unit, i, &type) != 0)
			break;

		qpi[i].type = type;
		qpi[i].config = config;
		qpi[i].id = i;
		qpi[i].value_scaled = 0;
		memset(qpi[i].values, 0, sizeof(qpi[i].values));
		qpi[i].fd = INVALID_FD;
	}

	return i;
}

/*
 * Initialise up to 'num' entries of 'qpi' from the sysfs directories
 * '/sys/devices/uncore_qpi_<i>', using config 0x600 for each.
 *
 * Units are probed in order from 0; probing stops at the first unit whose
 * type cannot be read. Returns the number of entries initialised; entries
 * beyond that count are left untouched.
 */
int
os_sysfs_uncore_qpi_init(qpi_info_t *qpi, int num)
{
	return uncore_link_init(qpi, num, "qpi", 0x600);
}

/*
 * Same as os_sysfs_uncore_qpi_init(), but for the directories
 * '/sys/devices/uncore_upi_<i>' and with config 0x0f02.
 */
int
os_sysfs_uncore_upi_init(qpi_info_t *qpi, int num)
{
	return uncore_link_init(qpi, num, "upi", 0x0f02);
}

/*
 * Initialise up to 'num' entries of 'imc' from the sysfs directories
 * '/sys/devices/uncore_imc_<i>'.
 *
 * Units are probed in order from 0; probing stops at the first unit whose
 * type cannot be read. Returns the number of entries initialised; entries
 * beyond that count are left untouched.
 */
int
os_sysfs_uncore_imc_init(imc_info_t *imc, int num)
{
	int i, type;

	for (i = 0; i < num; i++)
	{
		if (uncore_type_read("imc", i, &type) != 0)
			break;

		imc[i].type = type;
		imc[i].id = i;
		imc[i].value_scaled = 0;
		memset(imc[i].values, 0, sizeof(imc[i].values));
		imc[i].fd = INVALID_FD;
	}

	return i;
}

/*
 * Run 'command' through popen() with the given mode and wait for it to
 * finish.
 *
 * Returns B_FALSE only if popen() itself fails. The exit status of the
 * command is not examined, so B_TRUE does not mean the command succeeded.
 */
static boolean_t execute_command(const char *command, const char *type)
{
	FILE *stream = popen(command, type);

	if (stream != NULL) {
		pclose(stream);
		debug_print(NULL, 2, "Execute '%s' ok\n", command);
		return B_TRUE;
	}

	debug_print(NULL, 2, "Execute '%s' failed (errno = %d)\n",
		command, errno);

	return B_FALSE;
}

/*
 * Tell whether resctrl looks mounted, judged by whether
 * '/sys/fs/resctrl/tasks' can be opened for reading.
 */
static boolean_t resctrl_mounted(void)
{
	FILE *tasks = fopen("/sys/fs/resctrl/tasks", "r");

	if (tasks != NULL) {
		fclose(tasks);
		return B_TRUE;
	}

	return B_FALSE;
}

/*
 * Prepare for resctrl-based monitoring: reset g_pqos_moni_id and make sure
 * resctrl is mounted on /sys/fs/resctrl, running 'mount' if it is not
 * there yet.
 *
 * Returns B_TRUE if resctrl is mounted when the function returns.
 */
boolean_t os_cmt_init(void)
{
	static const char mount_cmd[] =
		"mount -t resctrl resctrl /sys/fs/resctrl 2>/dev/null";

	g_pqos_moni_id = 0;

	if (resctrl_mounted()) {
		return B_TRUE;
	}

	if (!execute_command(mount_cmd, "r")) {
		return B_FALSE;
	}

	/* The mount may still have failed, so look again. */
	return resctrl_mounted();
}

/*
 * Tear down resctrl-based monitoring: if resctrl is mounted, run a forced
 * 'umount' of /sys/fs/resctrl and reset g_pqos_moni_id. Does nothing when
 * resctrl is not mounted.
 */
void os_cmt_fini(void)
{
	static const char umount_cmd[] =
		"umount -f /sys/fs/resctrl 2>/dev/null";

	if (resctrl_mounted()) {
		execute_command(umount_cmd, "r");
		g_pqos_moni_id = 0;
	}
}

/*
 * Create a fresh resctrl monitoring group and put a task into it.
 *
 * pid, lwpid: the task to monitor; 'lwpid' takes precedence when it is
 *             non-zero.
 * pqos:       'task_id' is set to the id that names the group: lwpid,
 *             else pid, else the next value of g_pqos_moni_id.
 *
 * The group directory '/sys/fs/resctrl/mon_groups/<task_id>' is removed if
 * it exists, created again, and the task id is written to its 'tasks'
 * file.
 *
 * Returns 0 on success. Returns -1 if the group cannot be created or the
 * task cannot be added; these steps are done with direct system calls so
 * that such failures are actually detected.
 */
int os_sysfs_cmt_task_set(int pid, int lwpid, struct _perf_pqos *pqos)
{
	char group[128], tasks[160];
	FILE *fp;
	int task = (lwpid == 0) ? pid : lwpid;
	int written;

	if (lwpid)
		pqos->task_id = lwpid;
	else if (pid)
		pqos->task_id = pid;
	else
		pqos->task_id = ++g_pqos_moni_id;

	snprintf(group, sizeof(group),
		"/sys/fs/resctrl/mon_groups/%d", pqos->task_id);

	/*
	 * Drop a group left over from an earlier run. Failure is ignored:
	 * usually the group simply does not exist, and anything else shows
	 * up in the steps below.
	 */
	(void) rmdir(group);

	/* A group that could not be removed above is reused. */
	if ((mkdir(group, 0755) != 0) && (errno != EEXIST)) {
		debug_print(NULL, 2, "Create '%s' failed (errno = %d)\n",
			group, errno);
		return -1;
	}

	/*
	 * With neither pid nor lwpid there is no task to add.
	 * NOTE(review): resctrl reportedly treats a written 0 as "the
	 * writing task", which must not become this process -- confirm.
	 */
	if (task == 0)
		return 0;

	snprintf(tasks, sizeof(tasks), "%s/tasks", group);
	if ((fp = fopen(tasks, "w")) == NULL) {
		debug_print(NULL, 2, "Open '%s' failed (errno = %d)\n",
			tasks, errno);
		return -1;
	}

	/* The write may only reach the kernel when the stream is closed. */
	written = (fprintf(fp, "%d\n", task) > 0);
	if (fclose(fp) != 0)
		written = 0;

	if (!written) {
		debug_print(NULL, 2, "Add task %d to '%s' failed\n",
			task, tasks);
		return -1;
	}

	return 0;
}

/*
 * Read one counter, '<dir>/mon_L3_<nid>/<field>', into '*val'. The node id
 * is written with at least two digits (e.g. mon_L3_00).
 *
 * '*val' is set to 0 up front, so that is what the caller sees on failure.
 * Returns 0 on success, -1 if the file cannot be opened, is empty, or does
 * not start with an unsigned number.
 */
static int cmt_task_node_value(const char *dir, int nid,
	const char *field, uint64_t *val)
{
	FILE *fp;
	char buf[LINE_SIZE], path[128];
	char *end;
	unsigned long long parsed;

	*val = 0;

	snprintf(path, sizeof(path), "%s/mon_L3_%02d/%s", dir, nid, field);

	if ((fp = fopen(path, "r")) == NULL)
		return (-1);

	if (fgets(buf, LINE_SIZE, fp) == NULL) {
		fclose(fp);
		return (-1);
	}

	fclose(fp);

	/*
	 * Parse as an integer: going through a double would lose precision
	 * for large counters, and text that is not a number would silently
	 * count as a valid 0.
	 */
	errno = 0;
	parsed = strtoull(buf, &end, 10);
	if ((end == buf) || (errno == ERANGE))
		return (-1);

	*val = parsed;

	debug_print(NULL, 2, "%s: val = %" PRIu64 ", nid = %d\n", path, *val, nid);

	return 0;
}

/*
 * Return the value of counter 'field' below 'dir'.
 *
 * For nid == -1 the values of all nodes 0 .. NNODES_MAX - 1 that can be
 * read are added up. Otherwise the value for node 'nid' alone is returned,
 * with whatever cmt_task_node_value() left in it on failure.
 */
static uint64_t cmt_field_value(char *dir, const char *field, int nid)
{
	uint64_t sum = 0, node_val;
	int node;

	if (nid != -1) {
		cmt_task_node_value(dir, nid, field, &sum);
		return sum;
	}

	for (node = 0; node < NNODES_MAX; node++) {
		if (cmt_task_node_value(dir, node, field, &node_val) != 0)
			continue;

		sum += node_val;
	}

	return sum;
}

/*
 * Refresh the monitoring values of the group named by pqos->task_id, for
 * node 'nid' or for all nodes when nid is -1.
 *
 * 'occupancy_scaled' receives the current "llc_occupancy" value. For
 * "mbm_total_bytes" and "mbm_local_bytes" the new reading is kept in
 * 'totalbw' / 'localbw', and its difference to the previous reading goes
 * to 'totalbw_scaled' / 'localbw_scaled'.
 *
 * Always returns 0.
 */
int os_sysfs_cmt_task_value(struct _perf_pqos *pqos, int nid)
{
	char mon_data[128];
	uint64_t total_now, local_now;

	snprintf(mon_data, sizeof(mon_data),
		"/sys/fs/resctrl/mon_groups/%d/mon_data", pqos->task_id);

	pqos->occupancy_scaled = cmt_field_value(mon_data, "llc_occupancy", nid);
	total_now = cmt_field_value(mon_data, "mbm_total_bytes", nid);
	local_now = cmt_field_value(mon_data, "mbm_local_bytes", nid);

	pqos->totalbw_scaled = total_now - pqos->totalbw;
	pqos->totalbw = total_now;

	pqos->localbw_scaled = local_now - pqos->localbw;
	pqos->localbw = local_now;

	return 0;
}