Blob Blame History Raw
/*
 * Copyright (c) 2013, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the names of its contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* This file contains the routines that process the address mappings of a running process. */

#include <link.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <numa.h>
#include "../include/util.h"
#include "../include/proc.h"
#include "../include/os/os_util.h"
#include "../include/os/map.h"

/*
 * Set up the map module. The only step performed is pagesize_init();
 * the function always returns 0.
 */
int
map_init(void)
{
	int status = 0;

	pagesize_init();

	return (status);
}

/*
 * Teardown hook for the map module. Not supported in Linux, so the body
 * is intentionally empty.
 */
void
map_fini(void)
{
	return;
}

/*
 * Convert the permission field of a /proc/<pid>/maps line (e.g. "r-xp")
 * into an attribute bitmap.
 *
 * The characters are positional:
 *   [0] 'r' -> MAP_R_SET
 *   [1] 'w' -> MAP_W_SET
 *   [2] 'x' -> MAP_X_SET
 *   [3] 'p' -> MAP_P_SET, 's' -> MAP_S_SET
 * Any other character leaves the corresponding bit untouched.
 *
 * A NULL pointer yields 0. A string shorter than four characters is only
 * inspected up to its terminating NUL, so nothing beyond the end of the
 * string is ever read.
 *
 * Returns the resulting bitmap.
 */
static unsigned int
attr_bitmap(char *attr_str)
{
	unsigned int bitmap = 0;
	unsigned int len = 0;

	if (attr_str == NULL) {
		return (0);
	}

	/* Only the first four characters are meaningful; stop at the NUL. */
	while ((len < 4) && (attr_str[len] != '\0')) {
		len++;
	}

	if ((len > 0) && (attr_str[0] == 'r')) {
		MAP_R_SET(bitmap);
	}

	if ((len > 1) && (attr_str[1] == 'w')) {
		MAP_W_SET(bitmap);
	}

	if ((len > 2) && (attr_str[2] == 'x')) {
		MAP_X_SET(bitmap);
	}

	if (len > 3) {
		if (attr_str[3] == 'p') {
			MAP_P_SET(bitmap);
		} else if (attr_str[3] == 's') {
			MAP_S_SET(bitmap);
		}
	}

	return (bitmap);
}

/*
 * Append one mapping record to 'map'.
 *
 * array_alloc() is called first on the map's array and counters; a non-zero
 * return from it is reported as failure (-1) and nothing is added.
 * Otherwise the slot at index nentry_cur is filled in: address range,
 * attribute bitmap, need_resolve = B_TRUE, a zeroed numa_map, and a copy of
 * 'path' truncated to PATH_MAX - 1 characters (or an empty description when
 * 'path' is empty). nentry_cur is then incremented.
 *
 * 'path' must not be NULL.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
map_entry_add(map_proc_t *map, uint64_t start_addr, uint64_t end_addr,
	unsigned int attr, char *path)
{
	map_entry_t *slot;

	if (array_alloc((void **)(&map->arr), &map->nentry_cur,
		&map->nentry_max, sizeof (map_entry_t), MAP_ENTRY_NUM) != 0) {
		return (-1);
	}

	slot = map->arr + map->nentry_cur;

	memset(&slot->numa_map, 0, sizeof (numa_map_t));
	slot->need_resolve = B_TRUE;
	slot->attr = attr;
	slot->end_addr = end_addr;
	slot->start_addr = start_addr;

	if (path[0] == '\0') {
		slot->desc[0] = 0;
	} else {
		/* strncpy() does not terminate on truncation; do it by hand. */
		strncpy(slot->desc, path, PATH_MAX);
		slot->desc[PATH_MAX - 1] = 0;
	}

	map->nentry_cur++;
	return (0);
}

/*
 * Release the NUMA sub-map attached to a map entry and reset it to the
 * all-zero state.
 */
static void
numa_map_fini(map_entry_t *entry)
{
	numa_map_t *nm = &entry->numa_map;

	/* free(NULL) is a no-op, so no explicit NULL test is needed. */
	free(nm->arr);
	memset(nm, 0, sizeof (numa_map_t));
}

/*
 * Free every entry's NUMA sub-map, then the entry array itself, and zero
 * the whole map structure. A map whose array pointer is NULL is left
 * untouched.
 */
static void
map_free(map_proc_t *map)
{
	int idx = 0;

	if (map->arr != NULL) {
		while (idx < map->nentry_cur) {
			numa_map_fini(&map->arr[idx]);
			idx++;
		}

		free(map->arr);
		memset(map, 0, sizeof (map_proc_t));
	}
}

/*
 * Read /proc/<pid>/maps and build the list of mappings in 'map'.
 *
 * 'map' is zeroed first, so any previous content must already have been
 * released by the caller. Each line is split into its whitespace-separated
 * fields; the address range and the permission string are converted and
 * stored through map_entry_add(), together with the optional path.
 *
 * Any malformed line aborts the whole read: a line with fewer than two
 * fields, an address range that is not "<hex>-<hex>", or a failure of
 * map_entry_add().
 *
 * Returns 0 if at least one entry was loaded (map->loaded is then B_TRUE).
 * Returns -1 if the file cannot be opened, contains no entries, or a line
 * cannot be processed; in that case 'map' holds no allocated entries.
 */
static int
map_read(pid_t pid, map_proc_t *map)
{
	char path[PATH_MAX];
	char line[MAPFILE_LINE_SIZE];
	char addr_str[128], attr_str[128], off_str[128];
	char fd_str[128], inode_str[128], path_str[PATH_MAX];
	char s1[64], s2[64];
	char *end1, *end2;
	uint64_t start_addr, end_addr;
	unsigned int attr;
	int nargs, nadded = 0, ret = -1;
	FILE *fp;

	memset(map, 0, sizeof (map_proc_t));
	snprintf(path, sizeof (path), "/proc/%d/maps", (int)pid);
	if ((fp = fopen(path, "r")) == NULL) {
		return (-1);
	}

	while (fgets(line, sizeof (line), fp) != NULL) {
		/*
		 * e.g. 00400000-00405000 r-xp 00000000 fd:00 678793	/usr/bin/vmstat
		 *
		 * The offset, device and inode fields are parsed only to reach
		 * the path; they are not used afterwards.
		 */
		nargs = sscanf(line,
		    "%127[^ ] %127[^ ] %127[^ ] %127[^ ] %127[^ ] %4095[^\n]",
		    addr_str, attr_str, off_str, fd_str, inode_str, path_str);

		/*
		 * sscanf() returns the number of fields it stored. Both the
		 * address range and the attribute string are consumed below,
		 * so anything short of two fields would leave them
		 * uninitialized.
		 */
		if (nargs < 2) {
			goto L_EXIT;
		}

		/*
		 * Split to start_addr and end_addr.
		 * e.g. 00400000-00405000 -> start_addr = 00400000, end_addr = 00405000.
		 */
		if (sscanf(addr_str, "%63[^-]-%63s", s1, s2) != 2) {
			goto L_EXIT;
		}

		start_addr = strtoull(s1, &end1, 16);
		end_addr = strtoull(s2, &end2, 16);

		/* Both halves must be non-empty and consist of hex digits only. */
		if ((end1 == s1) || (*end1 != '\0') ||
		    (end2 == s2) || (*end2 != '\0')) {
			goto L_EXIT;
		}

		/*
		 * Convert to the attribute bitmap.
		 */
		attr = attr_bitmap(attr_str);

		/*
		 * The path is optional (e.g. anonymous mappings); use an empty
		 * string when it is absent.
		 */
		if (nargs != 6) {
			path_str[0] = 0;
		}

		if (map_entry_add(map, start_addr, end_addr, attr, path_str) != 0) {
			goto L_EXIT;
		}

		nadded++;
	}

	if (nadded > 0) {
		map->loaded = B_TRUE;
		ret = 0;
	}

L_EXIT:
	fclose(fp);

	/* map_free() is a no-op when nothing was allocated. */
	if (ret != 0) {
		map_free(map);
	}

	return (ret);
}

/*
 * Load, or reload, the address map of a tracked process.
 *
 * If no map has been loaded yet, /proc is read straight into proc->map.
 * Otherwise a fresh map is read into a temporary. Each new entry whose
 * start address and size exactly match an existing entry inherits that
 * entry's need_resolve flag; all others are flagged B_TRUE. The old map is
 * then freed and replaced by the fresh one.
 *
 * Returns 0 on success, -1 if the map could not be read (an already loaded
 * map is left unchanged in that case).
 */
int
map_proc_load(track_proc_t *proc)
{
	map_proc_t *map = &proc->map;
	map_proc_t fresh;
	map_entry_t *cur, *prev;
	int idx;

	/* First load: nothing to merge with. */
	if (!map->loaded) {
		return ((map_read(proc->pid, map) != 0) ? -1 : 0);
	}

	if (map_read(proc->pid, &fresh) != 0) {
		return (-1);
	}

	for (idx = 0; idx < fresh.nentry_cur; idx++) {
		cur = &fresh.arr[idx];
		prev = map_entry_find(proc, cur->start_addr,
			cur->end_addr - cur->start_addr);

		if (prev != NULL) {
			cur->need_resolve = prev->need_resolve;
		} else {
			cur->need_resolve = B_TRUE;
		}
	}

	/* Swap in the fresh map; the array it owns now belongs to proc. */
	map_free(&proc->map);
	memcpy(&proc->map, &fresh, sizeof (map_proc_t));
	return (0);
}

/*
 * Release the address map owned by a tracked process. Always returns 0.
 */
int
map_proc_fini(track_proc_t *proc)
{
	map_proc_t *map = &proc->map;

	map_free(map);
	return (0);
}

/*
 * Comparison callback for bsearch(): p1 points to the uint64_t address being
 * looked up, p2 to a candidate map_entry_t.
 *
 * Returns -1 when the address lies below the entry, 1 when it lies at or
 * beyond the entry's end address (the end is exclusive), and 0 when the
 * address falls inside [start_addr, end_addr).
 */
static int
entryaddr_cmp(const void *p1, const void *p2)
{
	const map_entry_t *candidate = (const map_entry_t *)p2;
	const uint64_t key = *(const uint64_t *)p1;

	if (key < candidate->start_addr) {
		return (-1);
	}

	return ((key >= candidate->end_addr) ? 1 : 0);
}

/*
 * Look up the map entry that covers exactly [addr, addr + size).
 *
 * The entry array is searched with bsearch(), so it has to be ordered by
 * address. An entry that merely contains 'addr' is not enough: both its
 * start and its end must match.
 *
 * Returns the matching entry, or NULL if there is none.
 */
map_entry_t *
map_entry_find(track_proc_t *proc, uint64_t addr, uint64_t size)
{
	map_entry_t *hit;

	hit = bsearch(&addr, proc->map.arr, proc->map.nentry_cur,
		sizeof (map_entry_t), entryaddr_cmp);

	if (hit == NULL) {
		return (NULL);
	}

	if ((hit->start_addr != addr) || (hit->end_addr != addr + size)) {
		return (NULL);
	}

	return (hit);
}

/*
 * Append a one-page record [addr, addr + g_pagesize) located on node 'nid'
 * to a NUMA map.
 *
 * array_alloc() is called first on the map's array and counters; a non-zero
 * return from it is treated as failure.
 *
 * Returns the new record, or NULL on failure.
 */
static numa_entry_t *
numa_entry_add(numa_map_t *numa_map, uint64_t addr, int nid)
{
	numa_entry_t *slot;

	if (array_alloc((void **)(&numa_map->arr), &numa_map->nentry_cur,
		&numa_map->nentry_max, sizeof (numa_entry_t), MAP_ENTRY_NUM) != 0) {
		return (NULL);
	}

	slot = numa_map->arr + numa_map->nentry_cur;
	numa_map->nentry_cur++;

	slot->nid = nid;
	slot->start_addr = addr;
	slot->end_addr = addr + g_pagesize;
	return (slot);
}

/*
 * Fold a batch of (page address, node id) pairs into a NUMA map.
 *
 * A page is merged into the current record when it starts where that record
 * ends and sits on the same node; otherwise a new record is started.
 * 'last_entry' is the record returned by the previous batch, or NULL for the
 * first batch, in which case the first page opens a new record (addr_num
 * must then be at least 1).
 *
 * Returns the record that is current after the batch, or NULL if a record
 * could not be added.
 */
static numa_entry_t *
numa_map_update(numa_map_t *numa_map, void **addr_arr, int *node_arr,
	int addr_num, numa_entry_t *last_entry)
{
	numa_entry_t *cur = last_entry;
	uint64_t page;
	int idx = 0, nid;

	if (cur == NULL) {
		cur = numa_entry_add(numa_map, (uint64_t)(addr_arr[0]),
			node_arr[0]);
		if (cur == NULL) {
			return (NULL);
		}

		idx = 1;
	}

	while (idx < addr_num) {
		page = (uint64_t)(addr_arr[idx]);
		nid = node_arr[idx];

		if ((cur->nid == nid) && (cur->end_addr == page)) {
			/* Contiguous and on the same node: extend the record. */
			cur->end_addr += g_pagesize;
		} else {
			cur = numa_entry_add(numa_map, page, nid);
			if (cur == NULL) {
				return (NULL);
			}
		}

		idx++;
	}

	return (cur);
}

/*
 * Rebuild the NUMA layout (numa_map) of one mapping.
 *
 * The mapping is walked page by page in batches of at most
 * NUMA_MOVE_NPAGES. Each batch is passed to numa_move_pages() with a NULL
 * node list, which queries rather than moves: the per-page result comes
 * back in node_arr. The results are folded into map_entry->numa_map by
 * numa_map_update().
 *
 * Page counts and offsets are kept in 64-bit variables. In 'unsigned int'
 * the page count truncates for very large mappings, and the byte offset
 * "(page index) * g_pagesize" can wrap at 32 bits when g_pagesize is not a
 * 64-bit type (NOTE(review): its declaration is not in this file).
 *
 * Returns 0 on success. Returns -1 if the entry's range is inverted, if
 * numa_move_pages() fails or if the NUMA map cannot be extended; a partially
 * built numa_map may be left in the entry in the latter two cases.
 */
int
map_map2numa(track_proc_t *proc, map_entry_t *map_entry)
{
	void *addr_arr[NUMA_MOVE_NPAGES];
	int node_arr[NUMA_MOVE_NPAGES];
	uint64_t npages_total, npages_left, npages_moved = 0, page_addr;
	unsigned int i, npages_tomove;
	numa_entry_t *last_entry = NULL;

	numa_map_fini(map_entry);

	/* An inverted range would make the unsigned subtraction wrap. */
	if (map_entry->end_addr < map_entry->start_addr) {
		return (-1);
	}

	npages_total = (map_entry->end_addr - map_entry->start_addr) / g_pagesize;
	while (npages_moved < npages_total) {
		npages_left = npages_total - npages_moved;
		if (npages_left < NUMA_MOVE_NPAGES) {
			npages_tomove = (unsigned int)npages_left;
		} else {
			npages_tomove = NUMA_MOVE_NPAGES;
		}

		for (i = 0; i < npages_tomove; i++) {
			/* Force 64-bit arithmetic for the byte offset. */
			page_addr = map_entry->start_addr +
				(npages_moved + i) * (uint64_t)g_pagesize;
			addr_arr[i] = (void *)page_addr;
		}

		memset(node_arr, 0, sizeof (node_arr));
		if (numa_move_pages(proc->pid, npages_tomove, addr_arr, NULL,
			node_arr, 0) != 0) {
			return (-1);
		}

		if ((last_entry = numa_map_update(&map_entry->numa_map, addr_arr,
			node_arr, npages_tomove, last_entry)) == NULL) {
			return (-1);
		}

		npages_moved += npages_tomove;
	}

	return (0);
}

/*
 * Distribute a set of sampled accesses over NUMA nodes.
 *
 * numa_move_pages() is called with a NULL node list to obtain, for each of
 * the 'addr_num' addresses in 'addr_arr', a per-address status value. Every
 * status in the range [0, nnodes) is taken as a node id: that node's
 * naccess counter is incremented and lat_arr[i] is added to its total_lat.
 * Statuses outside that range are skipped. nodedst_arr is accumulated into,
 * not cleared.
 *
 * On success *naccess_total holds the number of accesses that were
 * attributed to a node. On failure it is left untouched.
 *
 * Returns 0 on success, -1 if the status array cannot be allocated or
 * numa_move_pages() fails.
 */
int
map_addr2nodedst(pid_t pid, void **addr_arr, int *lat_arr, int addr_num,
	map_nodedst_t *nodedst_arr, int nnodes, int *naccess_total)
{
	int *status_arr;
	int idx, node, ret = -1;

	status_arr = zalloc(sizeof (int) * addr_num);
	if (status_arr == NULL) {
		return (-1);
	}

	if (numa_move_pages(pid, addr_num, addr_arr, NULL, status_arr, 0) == 0) {
		*naccess_total = 0;

		for (idx = 0; idx < addr_num; idx++) {
			node = status_arr[idx];
			if ((node < 0) || (node >= nnodes)) {
				continue;
			}

			nodedst_arr[node].naccess++;
			nodedst_arr[node].total_lat += lat_arr[idx];
			*naccess_total += 1;
		}

		ret = 0;
	}

	free(status_arr);
	return (ret);
}