/*

numad - NUMA Daemon to automatically bind processes to NUMA nodes
Copyright (C) 2012 Bill Gray (bgray@redhat.com), Red Hat Inc

numad is free software; you can redistribute it and/or modify it under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation; version 2.1.

numad is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should find a copy of v2.1 of the GNU Lesser General Public License
somewhere on your Linux system; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

*/
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
#define _GNU_SOURCE
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
#include <assert.h>
|
|
Packit |
6ad14e |
#include <ctype.h>
|
|
Packit |
6ad14e |
#include <dirent.h>
|
|
Packit |
6ad14e |
#include <errno.h>
|
|
Packit |
6ad14e |
#include <fcntl.h>
|
|
Packit |
6ad14e |
#include <getopt.h>
|
|
Packit |
6ad14e |
#include <limits.h>
|
|
Packit |
6ad14e |
#include <math.h>
|
|
Packit |
6ad14e |
#include <pthread.h>
|
|
Packit |
6ad14e |
#include <sched.h>
|
|
Packit |
6ad14e |
#include <signal.h>
|
|
Packit |
6ad14e |
#include <stdarg.h>
|
|
Packit |
6ad14e |
#include <stdint.h>
|
|
Packit |
6ad14e |
#include <stdio.h>
|
|
Packit |
6ad14e |
#include <stdlib.h>
|
|
Packit |
6ad14e |
#include <string.h>
|
|
Packit |
6ad14e |
#include <time.h>
|
|
Packit |
6ad14e |
#include <unistd.h>
|
|
Packit |
6ad14e |
#include <values.h>
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
#include <sys/ipc.h>
|
|
Packit |
6ad14e |
#include <sys/mman.h>
|
|
Packit |
6ad14e |
#include <sys/msg.h>
|
|
Packit |
6ad14e |
#include <sys/sem.h>
|
|
Packit |
6ad14e |
#include <sys/shm.h>
|
|
Packit |
6ad14e |
#include <sys/stat.h>
|
|
Packit |
6ad14e |
#include <sys/syslog.h>
|
|
Packit |
6ad14e |
#include <sys/time.h>
|
|
Packit |
6ad14e |
#include <sys/types.h>
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
#include <asm/unistd.h>
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Version stamp reported by the program.
#define VERSION_STRING "20150602"

// Run file (removed at shutdown) and the append-mode log file.
#define VAR_RUN_FILE "/var/run/numad.pid"
#define VAR_LOG_FILE "/var/log/numad.log"

// Byte-count multipliers.
#define KILOBYTE (1024)
#define MEGABYTE (1024 * 1024)

// Sizes for fixed character buffers.
#define FNAME_SIZE 192
#define BUF_SIZE 1024
#define BIG_BUF_SIZE 4096

// The ONE_HUNDRED factor is used to scale time and CPU usage units.
// Several CPU quantities are measured in percents of a CPU; and
// several time values are counted in hundredths of a second.
#define ONE_HUNDRED 100

// Compile-time defaults; several seed the run-time tunables of similar name.
// NOTE(review): units are not stated in this part of the file -- confirm
// before relying on them.
#define MIN_INTERVAL 5
#define MAX_INTERVAL 15
#define CPU_THRESHOLD 50
#define MEMORY_THRESHOLD 300
#define DEFAULT_HTT_PERCENT 20
#define DEFAULT_THP_SCAN_SLEEP_MS 1000
#define DEFAULT_UTILIZATION_PERCENT 85
#define DEFAULT_MEMLOCALITY_PERCENT 90
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Parse a run of decimal digits starting at *p into n, advancing p to the
// first character after the run.
//
//   p - modifiable char pointer lvalue; must point at a digit on entry (the
//       first character is consumed unconditionally).
//   n - integer lvalue receiving the value; no overflow checking is done.
//
// Both arguments are evaluated several times, so they must be free of side
// effects.  The expansion is a sequence of statements (not a single
// expression), so do not use it as the unbraced body of an if/else.
// The cast to unsigned char keeps isdigit() defined for bytes >= 0x80.
#define CONVERT_DIGITS_TO_NUM(p, n) \
    (n) = *(p)++ - '0'; \
    while (isdigit((unsigned char)*(p))) { \
        (n) *= 10; \
        (n) += (*(p)++ - '0'); \
    }
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int num_cpus = 0;
|
|
Packit |
6ad14e |
int num_nodes = 0;
|
|
Packit |
6ad14e |
int threads_per_core = 0;
|
|
Packit |
6ad14e |
uint64_t page_size_in_bytes = 0;
|
|
Packit |
6ad14e |
uint64_t huge_page_size_in_bytes = 0;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int min_interval = MIN_INTERVAL;
|
|
Packit |
6ad14e |
int max_interval = MAX_INTERVAL;
|
|
Packit |
6ad14e |
int htt_percent = DEFAULT_HTT_PERCENT;
|
|
Packit |
6ad14e |
int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS;
|
|
Packit |
6ad14e |
int target_utilization = DEFAULT_UTILIZATION_PERCENT;
|
|
Packit |
6ad14e |
int target_memlocality = DEFAULT_MEMLOCALITY_PERCENT;
|
|
Packit |
6ad14e |
int scan_all_processes = 1;
|
|
Packit |
6ad14e |
int keep_interleaved_memory = 0;
|
|
Packit |
6ad14e |
int use_inactive_file_cache = 1;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
pthread_mutex_t pid_list_mutex;
|
|
Packit |
6ad14e |
pthread_mutex_t node_info_mutex;
|
|
Packit |
6ad14e |
long sum_CPUs_total = 0;
|
|
Packit |
6ad14e |
int requested_mbs = 0;
|
|
Packit |
6ad14e |
int requested_cpus = 0;
|
|
Packit |
6ad14e |
// Flags raised by sig_handler() and polled by the rest of the program.
// They are written from asynchronous signal context, so they must be
// volatile sig_atomic_t: that is the only object type the C standard
// guarantees can be safely stored to from a handler, and volatile stops the
// compiler from caching the value in code that polls it.
volatile sig_atomic_t got_sighup = 0;
volatile sig_atomic_t got_sigterm = 0;
volatile sig_atomic_t got_sigquit = 0;

// Signal handler: record which signal arrived and return immediately.
// Only SIGHUP, SIGTERM and SIGQUIT are recorded; anything else is ignored.
void sig_handler(int signum) {
    switch (signum) {
    case SIGHUP:
        got_sighup = 1;
        break;
    case SIGTERM:
        got_sigterm = 1;
        break;
    case SIGQUIT:
        got_sigquit = 1;
        break;
    default:
        break;
    }
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
FILE *log_fs = NULL;
|
|
Packit |
6ad14e |
int log_level = LOG_NOTICE;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void numad_log(int level, const char *fmt, ...) {
|
|
Packit |
6ad14e |
if (level > log_level) {
|
|
Packit |
6ad14e |
return;
|
|
Packit |
6ad14e |
// Logging levels (from sys/syslog.h)
|
|
Packit |
6ad14e |
// #define LOG_EMERG 0 /* system is unusable */
|
|
Packit |
6ad14e |
// #define LOG_ALERT 1 /* action must be taken immediately */
|
|
Packit |
6ad14e |
// #define LOG_CRIT 2 /* critical conditions */
|
|
Packit |
6ad14e |
// #define LOG_ERR 3 /* error conditions */
|
|
Packit |
6ad14e |
// #define LOG_WARNING 4 /* warning conditions */
|
|
Packit |
6ad14e |
// #define LOG_NOTICE 5 /* normal but significant condition */
|
|
Packit |
6ad14e |
// #define LOG_INFO 6 /* informational */
|
|
Packit |
6ad14e |
// #define LOG_DEBUG 7 /* debug-level messages */
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
time_t ts = time(NULL);
|
|
Packit |
6ad14e |
strncpy(buf, ctime(&ts), sizeof(buf));
|
|
Packit |
6ad14e |
char *p = &buf[strlen(buf) - 1];
|
|
Packit |
6ad14e |
*p++ = ':';
|
|
Packit |
6ad14e |
*p++ = ' ';
|
|
Packit |
6ad14e |
va_list ap;
|
|
Packit |
6ad14e |
va_start(ap, fmt);
|
|
Packit |
6ad14e |
vsnprintf(p, BUF_SIZE, fmt, ap);
|
|
Packit |
6ad14e |
va_end(ap);
|
|
Packit |
6ad14e |
fprintf(log_fs, "%s", buf);
|
|
Packit |
6ad14e |
fflush(log_fs);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void open_log_file() {
|
|
Packit |
6ad14e |
log_fs = fopen(VAR_LOG_FILE, "a");
|
|
Packit |
6ad14e |
if (log_fs == NULL) {
|
|
Packit |
6ad14e |
log_fs = stderr;
|
|
Packit |
6ad14e |
numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void close_log_file() {
|
|
Packit |
6ad14e |
if (log_fs != NULL) {
|
|
Packit |
6ad14e |
if (log_fs != stderr) {
|
|
Packit |
6ad14e |
fclose(log_fs);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
log_fs = NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Capacity, including the terminating NUL, of the text carried in a message.
#define MSG_BODY_TEXT_SIZE 96

// Payload of a System V queue message (everything after the mtype field).
struct msg_body {
    long src_pid;                   // PID of the sending process
    long cmd;                       // command code
    long arg1;                      // first command argument
    long arg2;                      // second command argument
    char text[MSG_BODY_TEXT_SIZE];  // NUL-terminated text argument
};
typedef struct msg_body msg_body_t;
typedef struct msg_body *msg_body_p;

// Full message as passed to msgsnd()/msgrcv().
struct msg {
    long dst_pid;  // msg mtype is dest PID
    msg_body_t body;
};
typedef struct msg msg_t;
typedef struct msg *msg_p;

// Identifier of the message queue, set by init_msg_queue().
int msg_qid;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void flush_msg_queue() {
|
|
Packit |
6ad14e |
msg_t msg;
|
|
Packit |
6ad14e |
do {
|
|
Packit |
6ad14e |
msgrcv(msg_qid, &msg, sizeof(msg_body_t), 0, IPC_NOWAIT);
|
|
Packit |
6ad14e |
} while (errno != ENOMSG);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void init_msg_queue() {
|
|
Packit |
6ad14e |
key_t msg_key = 0xdeadbeef;
|
|
Packit |
6ad14e |
int msg_flg = 0660 | IPC_CREAT;
|
|
Packit |
6ad14e |
msg_qid = msgget(msg_key, msg_flg);
|
|
Packit |
6ad14e |
if (msg_qid < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "msgget failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
flush_msg_queue();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Block until a message addressed to this process (mtype == getpid())
// arrives, and store it in *m.  Any msgrcv() failure is logged and
// terminates the process.
void recv_msg(msg_p m) {
    ssize_t received = msgrcv(msg_qid, m, sizeof(msg_body_t), getpid(), 0);
    if (received >= 0) {
        return;
    }
    numad_log(LOG_CRIT, "msgrcv failed\n");
    exit(EXIT_FAILURE);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void send_msg(long dst_pid, long cmd, long arg1, long arg2, char *s) {
|
|
Packit |
6ad14e |
msg_t msg;
|
|
Packit |
6ad14e |
msg.dst_pid = dst_pid;
|
|
Packit |
6ad14e |
msg.body.src_pid = getpid();
|
|
Packit |
6ad14e |
msg.body.cmd = cmd;
|
|
Packit |
6ad14e |
msg.body.arg1 = arg1;
|
|
Packit |
6ad14e |
msg.body.arg2 = arg2;
|
|
Packit |
6ad14e |
int s_len = strlen(s);
|
|
Packit |
6ad14e |
if (s_len >= MSG_BODY_TEXT_SIZE) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "msgsnd text too big\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
strcpy(msg.body.text, s);
|
|
Packit |
6ad14e |
size_t m_len = sizeof(msg_body_t) - MSG_BODY_TEXT_SIZE + s_len + 1;
|
|
Packit |
6ad14e |
if (msgsnd(msg_qid, &msg, m_len, IPC_NOWAIT) < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "msgsnd failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// printf("Sent: >>%s<< to process %d\n", msg.body.text, msg.dst_pid);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Use CPU_SET(3) <sched.h> bitmasks, but bundle size and pointer together
// and genericize for both CPU and Node IDs.
struct id_list {
    cpu_set_t *set_p;   // dynamically sized bitmask (CPU_ALLOC)
    size_t bytes;       // size of the bitmask in bytes (CPU_ALLOC_SIZE)
};
typedef struct id_list id_list_t;
typedef struct id_list *id_list_p;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Helpers for id_list_t, thin wrappers over the CPU_*_S macros of CPU_SET(3).
//
// Every macro parameter is parenthesized so any pointer-valued expression can
// be passed.  Parameters are evaluated more than once, so arguments must not
// have side effects.  INIT_ID_LIST, CLEAR_CPU_LIST, CLEAR_NODE_LIST and
// FREE_LIST expand to several statements: use them as full statements, never
// as the unbraced body of an if/else.

// Field accessors.
#define ID_LIST_SET_P(list_p) ((list_p)->set_p)
#define ID_LIST_BYTES(list_p) ((list_p)->bytes)

// Allocate a list able to hold IDs 0 .. num_elements-1 and store it in the
// lvalue list_p.  The bitmask is NOT cleared.  Exits the process on
// allocation failure.
#define INIT_ID_LIST(list_p, num_elements) \
    (list_p) = malloc(sizeof(id_list_t)); \
    if ((list_p) == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
    (list_p)->set_p = CPU_ALLOC(num_elements); \
    if ((list_p)->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
    (list_p)->bytes = CPU_ALLOC_SIZE(num_elements);

// Empty a CPU list, allocating it (sized for num_cpus) if it is still NULL.
#define CLEAR_CPU_LIST(list_p) \
    if ((list_p) == NULL) { \
        INIT_ID_LIST(list_p, num_cpus); \
    } \
    CPU_ZERO_S((list_p)->bytes, (list_p)->set_p)

// Empty a node list, allocating it (sized for num_nodes) if it is still NULL.
#define CLEAR_NODE_LIST(list_p) \
    if ((list_p) == NULL) { \
        INIT_ID_LIST(list_p, num_nodes); \
    } \
    CPU_ZERO_S((list_p)->bytes, (list_p)->set_p)

// Release a list and its bitmask, then reset the lvalue to NULL.
#define FREE_LIST(list_p) \
    if ((list_p) != NULL) { \
        if ((list_p)->set_p != NULL) { CPU_FREE((list_p)->set_p); } \
        free(list_p); \
        (list_p) = NULL; \
    }

// Copy the bitmask of orig_list_p into copy_list_p; the destination must
// already be allocated with at least orig_list_p->bytes bytes.
#define COPY_LIST(orig_list_p, copy_list_p) \
    memcpy((copy_list_p)->set_p, (orig_list_p)->set_p, (orig_list_p)->bytes)

// Membership and counting.
#define NUM_IDS_IN_LIST(list_p) CPU_COUNT_S((list_p)->bytes, (list_p)->set_p)
#define ADD_ID_TO_LIST(k, list_p) CPU_SET_S((k), (list_p)->bytes, (list_p)->set_p)
#define CLR_ID_IN_LIST(k, list_p) CPU_CLR_S((k), (list_p)->bytes, (list_p)->set_p)
#define ID_IS_IN_LIST(k, list_p) CPU_ISSET_S((k), (list_p)->bytes, (list_p)->set_p)

// Set comparison and combination; the size of the first list argument is
// used for the whole operation.
#define EQUAL_LISTS(list_1_p, list_2_p) CPU_EQUAL_S((list_1_p)->bytes, (list_1_p)->set_p, (list_2_p)->set_p)
#define AND_LISTS(and_list_p, list_1_p, list_2_p) CPU_AND_S((and_list_p)->bytes, (and_list_p)->set_p, (list_1_p)->set_p, (list_2_p)->set_p)
#define OR_LISTS( or_list_p, list_1_p, list_2_p) CPU_OR_S( (or_list_p)->bytes, (or_list_p)->set_p, (list_1_p)->set_p, (list_2_p)->set_p)
#define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S((xor_list_p)->bytes, (xor_list_p)->set_p, (list_1_p)->set_p, (list_2_p)->set_p)
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Invert membership of every CPU ID in [0, num_cpus) in place.
//
//   list_p - list to invert; must not be NULL.
//
// Returns the number of IDs in the list after inversion.  Logs and exits the
// process when list_p is NULL or num_cpus is not yet positive.
int negate_cpu_list(id_list_p list_p) {
    if (list_p == NULL) {
        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
        exit(EXIT_FAILURE);
    }
    if (num_cpus < 1) {
        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
        exit(EXIT_FAILURE);
    }
    int cpu_id = 0;
    while (cpu_id < num_cpus) {
        int was_member = ID_IS_IN_LIST(cpu_id, list_p);
        if (was_member) {
            CLR_ID_IN_LIST(cpu_id, list_p);
        } else {
            ADD_ID_TO_LIST(cpu_id, list_p);
        }
        cpu_id += 1;
    }
    return NUM_IDS_IN_LIST(list_p);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Add the IDs described by a string such as "0-3,8,10-11" to a list.
//
//   list_p - destination list; must not be NULL (logs and exits otherwise).
//   s      - text to parse; NULL or empty adds nothing.  Parsing stops at the
//            first newline or at the terminating NUL.
//
// Any non-digit acts as a separator; a '-' among the separators before a
// number makes that number the inclusive upper end of a range which starts
// one past the previously parsed number (or at 0 if there was none).
//
// Returns the number of IDs in the list afterwards.
int add_ids_to_list_from_str(id_list_p list_p, char *s) {
    if (list_p == NULL) {
        numad_log(LOG_CRIT, "Cannot add to NULL list\n");
        exit(EXIT_FAILURE);
    }
    if ((s == NULL) || (*s == '\0')) {
        return NUM_IDS_IN_LIST(list_p);
    }
    // IDs at or beyond the bitmask capacity are ignored by CPU_SET_S, so
    // there is no point iterating over them; bounding the loop also keeps a
    // huge range end from causing signed overflow or a near-endless loop.
    size_t capacity_bits = list_p->bytes * 8;
    int capacity = (capacity_bits > (size_t)INT_MAX) ? INT_MAX : (int)capacity_bits;
    int in_range = 0;
    int next_id = 0;
    for (;;) {
        // skip over non-digits (cast keeps isdigit defined for bytes >= 0x80)
        while (!isdigit((unsigned char)*s)) {
            if ((*s == '\n') || (*s == '\0')) {
                goto return_list;
            }
            if (*s++ == '-') {
                in_range = 1;
            }
        }
        int id;
        CONVERT_DIGITS_TO_NUM(s, id);
        if (!in_range) {
            next_id = id;
        }
        for (; ((next_id <= id) && (next_id < capacity)); next_id++) {
            ADD_ID_TO_LIST(next_id, list_p);
        }
        in_range = 0;
    }
return_list:
    return NUM_IDS_IN_LIST(list_p);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Render an ID list as text, e.g. "0-3,8,10-11".
//
//   str_p    - destination buffer; must not be NULL.
//   str_size - size of the buffer in bytes; must be at least 3.
//   list_p   - list to render; NULL or empty yields an empty string.
//
// Returns the length of the resulting string (excluding the NUL).  The
// result is always NUL-terminated and never written past str_size bytes: if
// the buffer is too small, output stops after the last item that fits
// completely and a warning is logged.  Logs and exits the process on a bad
// buffer argument.
int str_from_id_list(char *str_p, int str_size, id_list_p list_p) {
    char *p = str_p;
    if ((p == NULL) || (str_size < 3)) {
        numad_log(LOG_CRIT, "Bad string for ID listing\n");
        exit(EXIT_FAILURE);
    }
    char *end = str_p + str_size - 1;   // last byte is reserved for the NUL
    int n = (list_p == NULL) ? 0 : NUM_IDS_IN_LIST(list_p);
    if (n > 0) {
        int id_range_start = -1;
        for (int id = 0; ; id++) {
            int id_in_list = (ID_IS_IN_LIST(id, list_p) != 0);
            if ((id_in_list) && (id_range_start < 0)) {
                id_range_start = id;    // beginning an ID range
            } else if ((!id_in_list) && (id_range_start >= 0)) {
                // Format the range that just ended into a scratch buffer
                // first, so it is copied only if it fits in full.
                char item[32];
                int item_len;
                if (id - id_range_start > 1) {
                    item_len = snprintf(item, sizeof(item), "%d-%d,", id_range_start, (id - 1));
                } else {
                    item_len = snprintf(item, sizeof(item), "%d,", id_range_start);
                }
                if ((item_len < 0) || (item_len >= (int)sizeof(item)) || (item_len > (end - p))) {
                    numad_log(LOG_WARNING, "ID list string truncated\n");
                    break;
                }
                memcpy(p, item, item_len);
                p += item_len;
                id_range_start = -1;    // no longer in a range
                if (n <= 0) { break; }  // exit only after finishing a range
            }
            n -= id_in_list;
        }
        if (p > str_p) {
            p -= 1;                     // eliminate trailing ','
        }
    }
    *p = '\0';
    return (p - str_p);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
typedef struct node_data {
|
|
Packit |
6ad14e |
uint64_t node_id;
|
|
Packit |
6ad14e |
uint64_t MBs_total;
|
|
Packit |
6ad14e |
uint64_t MBs_free;
|
|
Packit |
6ad14e |
uint64_t CPUs_total; // scaled * ONE_HUNDRED
|
|
Packit |
6ad14e |
uint64_t CPUs_free; // scaled * ONE_HUNDRED
|
|
Packit |
6ad14e |
uint64_t magnitude; // hack: MBs * CPUs
|
|
Packit |
6ad14e |
uint8_t *distance;
|
|
Packit |
6ad14e |
id_list_p cpu_list_p;
|
|
Packit |
6ad14e |
} node_data_t, *node_data_p;
|
|
Packit |
6ad14e |
node_data_p node = NULL;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int min_node_CPUs_free_ix = -1;
|
|
Packit |
6ad14e |
int min_node_MBs_free_ix = -1;
|
|
Packit |
6ad14e |
long min_node_CPUs_free = MAXINT;
|
|
Packit |
6ad14e |
long min_node_MBs_free = MAXINT;
|
|
Packit |
6ad14e |
long max_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
long max_node_MBs_free = 0;
|
|
Packit |
6ad14e |
long avg_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
long avg_node_MBs_free = 0;
|
|
Packit |
6ad14e |
double stddev_node_CPUs_free = 0.0;
|
|
Packit |
6ad14e |
double stddev_node_MBs_free = 0.0;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// RING_BUF_SIZE must be a power of two
|
|
Packit |
6ad14e |
#define RING_BUF_SIZE 8
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
#define PROCESS_FLAG_INTERLEAVED (1 << 0)
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
typedef struct process_data {
|
|
Packit |
6ad14e |
int pid;
|
|
Packit |
6ad14e |
unsigned int flags;
|
|
Packit |
6ad14e |
uint64_t data_time_stamp; // hundredths of seconds
|
|
Packit |
6ad14e |
uint64_t bind_time_stamp;
|
|
Packit |
6ad14e |
uint64_t num_threads;
|
|
Packit |
6ad14e |
uint64_t MBs_size;
|
|
Packit |
6ad14e |
uint64_t MBs_used;
|
|
Packit |
6ad14e |
uint64_t cpu_util;
|
|
Packit |
6ad14e |
uint64_t CPUs_used; // scaled * ONE_HUNDRED
|
|
Packit |
6ad14e |
uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
|
|
Packit |
6ad14e |
int ring_buf_ix;
|
|
Packit |
6ad14e |
char *comm;
|
|
Packit |
6ad14e |
id_list_p node_list_p;
|
|
Packit |
6ad14e |
uint64_t *process_MBs;
|
|
Packit |
6ad14e |
} process_data_t, *process_data_p;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Hash table size must always be a power of two
|
|
Packit |
6ad14e |
#define MIN_PROCESS_HASH_TABLE_SIZE 16
|
|
Packit |
6ad14e |
int process_hash_table_size = 0;
|
|
Packit |
6ad14e |
int process_hash_collisions = 0;
|
|
Packit |
6ad14e |
process_data_p process_hash_table = NULL;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int process_hash_ix(int pid) {
|
|
Packit |
6ad14e |
unsigned ix = pid;
|
|
Packit |
6ad14e |
ix *= 717;
|
|
Packit |
6ad14e |
ix >>= 8;
|
|
Packit |
6ad14e |
ix &= (process_hash_table_size - 1);
|
|
Packit |
6ad14e |
return ix;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int process_hash_lookup(int pid) {
|
|
Packit |
6ad14e |
int ix = process_hash_ix(pid);
|
|
Packit |
6ad14e |
int starting_ix = ix;
|
|
Packit |
6ad14e |
while (process_hash_table[ix].pid) {
|
|
Packit |
6ad14e |
// Assumes table with some blank entries...
|
|
Packit |
6ad14e |
if (pid == process_hash_table[ix].pid) {
|
|
Packit |
6ad14e |
return ix; // found it
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
ix += 1;
|
|
Packit |
6ad14e |
ix &= (process_hash_table_size - 1);
|
|
Packit |
6ad14e |
if (ix == starting_ix) {
|
|
Packit |
6ad14e |
// Table full and pid not found.
|
|
Packit |
6ad14e |
// This "should never happen"...
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return -1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int process_hash_insert(int pid) {
|
|
Packit |
6ad14e |
// This reserves the hash table slot, but initializes only the pid field
|
|
Packit |
6ad14e |
int ix = process_hash_ix(pid);
|
|
Packit |
6ad14e |
int starting_ix = ix;
|
|
Packit |
6ad14e |
while (process_hash_table[ix].pid) {
|
|
Packit |
6ad14e |
if (pid == process_hash_table[ix].pid) {
|
|
Packit |
6ad14e |
return ix; // found it
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
process_hash_collisions += 1;
|
|
Packit |
6ad14e |
ix += 1;
|
|
Packit |
6ad14e |
ix &= (process_hash_table_size - 1);
|
|
Packit |
6ad14e |
if (ix == starting_ix) {
|
|
Packit |
6ad14e |
// This "should never happen"...
|
|
Packit |
6ad14e |
numad_log(LOG_ERR, "Process hash table is full\n");
|
|
Packit |
6ad14e |
return -1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
process_hash_table[ix].pid = pid;
|
|
Packit |
6ad14e |
return ix;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Update hash table stats for a process we are monitoring.  Only the scalar
// resource consumption stats are updated here.
//
//   newp - freshly sampled data; pid, comm, MBs_size, MBs_used, cpu_util,
//          num_threads and data_time_stamp are read.
//
// For an entry that already has a time stamp, a new CPU usage sample
// (100 * cpu_util delta / time delta) is pushed into the ring buffer and
// CPUs_used becomes the largest sample in the ring.
//
// Returns 1 if the entry had no previous time stamp (this includes the case
// where no slot could be obtained), otherwise 0.
int process_hash_update(process_data_p newp) {
    int new_hash_table_entry = 1;
    int ix = process_hash_insert(newp->pid);
    if (ix >= 0) {
        process_data_p p = &process_hash_table[ix];
        if (p->data_time_stamp) {
            new_hash_table_entry = 0;
            // Take a sample only when time has moved forward: a zero
            // interval would divide by zero, and a negative one would wrap
            // around in unsigned arithmetic.
            if (newp->data_time_stamp > p->data_time_stamp) {
                uint64_t time_diff = newp->data_time_stamp - p->data_time_stamp;
                // A counter that went backwards would wrap to a huge value;
                // count it as no usage instead.
                uint64_t cpu_util_diff = 0;
                if (newp->cpu_util > p->cpu_util) {
                    cpu_util_diff = newp->cpu_util - p->cpu_util;
                }
                p->ring_buf_ix += 1;
                p->ring_buf_ix &= (RING_BUF_SIZE - 1);
                p->CPUs_used_ring_buf[p->ring_buf_ix] = 100 * (cpu_util_diff) / time_diff;
            }
            // Use largest CPU utilization currently in ring buffer
            uint64_t max_CPUs_used = p->CPUs_used_ring_buf[0];
            for (int ring_ix = 1; (ring_ix < RING_BUF_SIZE); ring_ix++) {
                if (max_CPUs_used < p->CPUs_used_ring_buf[ring_ix]) {
                    max_CPUs_used = p->CPUs_used_ring_buf[ring_ix];
                }
            }
            p->CPUs_used = max_CPUs_used;
        }
        // Refresh the stored command name when it is missing or changed.
        // The old name is released only once the copy succeeded, so a failed
        // strdup() leaves the previous name in place.
        if ((newp->comm != NULL) && ((!p->comm) || (strcmp(p->comm, newp->comm)))) {
            char *comm_copy = strdup(newp->comm);
            if (comm_copy != NULL) {
                free(p->comm);
                p->comm = comm_copy;
            }
        }
        p->MBs_size = newp->MBs_size;
        p->MBs_used = newp->MBs_used;
        p->cpu_util = newp->cpu_util;
        p->num_threads = newp->num_threads;
        p->data_time_stamp = newp->data_time_stamp;
    }
    return new_hash_table_entry;
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void process_hash_clear_all_bind_time_stamps() {
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
|
Packit |
6ad14e |
process_hash_table[ix].bind_time_stamp = 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int process_hash_rehash(int old_ix) {
|
|
Packit |
6ad14e |
// Given the index of a table entry that would otherwise be orphaned by
|
|
Packit |
6ad14e |
// process_hash_remove(), reinsert into table using PID from existing record.
|
|
Packit |
6ad14e |
process_data_p op = &process_hash_table[old_ix];
|
|
Packit |
6ad14e |
int new_ix = process_hash_insert(op->pid);
|
|
Packit |
6ad14e |
if (new_ix >= 0) {
|
|
Packit |
6ad14e |
// Copy old slot to new slot, and zero old slot
|
|
Packit |
6ad14e |
process_data_p np = &process_hash_table[new_ix];
|
|
Packit |
6ad14e |
memcpy(np, op, sizeof(process_data_t));
|
|
Packit |
6ad14e |
memset(op, 0, sizeof(process_data_t));
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return new_ix;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int process_hash_remove(int pid) {
|
|
Packit |
6ad14e |
int ix = process_hash_lookup(pid);
|
|
Packit |
6ad14e |
if (ix >= 0) {
|
|
Packit |
6ad14e |
// remove the target
|
|
Packit |
6ad14e |
process_data_p dp = &process_hash_table[ix];
|
|
Packit |
6ad14e |
if (dp->comm) { free(dp->comm); }
|
|
Packit |
6ad14e |
if (dp->process_MBs) { free(dp->process_MBs); }
|
|
Packit |
6ad14e |
FREE_LIST(dp->node_list_p);
|
|
Packit |
6ad14e |
memset(dp, 0, sizeof(process_data_t));
|
|
Packit |
6ad14e |
// bubble up the collision chain and rehash if neeeded
|
|
Packit |
6ad14e |
for (;;) {
|
|
Packit |
6ad14e |
ix += 1;
|
|
Packit |
6ad14e |
ix &= (process_hash_table_size - 1);
|
|
Packit |
6ad14e |
if ((pid = process_hash_table[ix].pid) <= 0) {
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (process_hash_lookup(pid) < 0) {
|
|
Packit |
6ad14e |
if (process_hash_rehash(ix) < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_ERR, "rehash fail\n");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return ix;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void process_hash_table_expand() {
|
|
Packit |
6ad14e |
// Save old table size and address
|
|
Packit |
6ad14e |
int old_size = process_hash_table_size;
|
|
Packit |
6ad14e |
process_data_p old_table = process_hash_table;
|
|
Packit |
6ad14e |
// Double size of table and allocate new space
|
|
Packit |
6ad14e |
if (old_size > 0) {
|
|
Packit |
6ad14e |
process_hash_table_size *= 2;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
process_hash_table_size = MIN_PROCESS_HASH_TABLE_SIZE;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Expanding hash table size: %d\n", process_hash_table_size);
|
|
Packit |
6ad14e |
process_hash_table = malloc(process_hash_table_size * sizeof(process_data_t));
|
|
Packit |
6ad14e |
if (process_hash_table == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "hash table malloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Clear the new table, and copy valid entries from old table
|
|
Packit |
6ad14e |
memset(process_hash_table, 0, process_hash_table_size * sizeof(process_data_t));
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < old_size); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = &old_table[ix];
|
|
Packit |
6ad14e |
if (p->pid) {
|
|
Packit |
6ad14e |
int new_table_ix = process_hash_insert(p->pid);
|
|
Packit |
6ad14e |
memcpy(&process_hash_table[new_table_ix], p, sizeof(process_data_t));
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (old_table != NULL) {
|
|
Packit |
6ad14e |
free(old_table);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void process_hash_table_dump() {
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = &process_hash_table[ix];
|
|
Packit |
6ad14e |
if (p->pid) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG,
|
|
Packit |
6ad14e |
"ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld/%ld Data TS: %ld Bind TS: %ld\n",
|
|
Packit |
6ad14e |
ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
|
|
Packit |
6ad14e |
p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
|
|
Packit |
6ad14e |
// FIXME: make this dump every field, but this is not even currently used
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void process_hash_table_cleanup(uint64_t update_time) {
|
|
Packit |
6ad14e |
int num_hash_entries_used = 0;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = &process_hash_table[ix];
|
|
Packit |
6ad14e |
if (p->pid) {
|
|
Packit |
6ad14e |
num_hash_entries_used += 1;
|
|
Packit |
6ad14e |
if (p->data_time_stamp < update_time) {
|
|
Packit |
6ad14e |
// Mark as old, and zero CPU utilization
|
|
Packit |
6ad14e |
p->data_time_stamp = 0;
|
|
Packit |
6ad14e |
p->CPUs_used = 0;
|
|
Packit |
6ad14e |
// Check for dead pids and remove them...
|
|
Packit |
6ad14e |
if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
|
|
Packit |
6ad14e |
// Seems dead. Forget this pid
|
|
Packit |
6ad14e |
process_hash_remove(p->pid);
|
|
Packit |
6ad14e |
num_hash_entries_used -= 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Keep hash table approximately half empty
|
|
Packit |
6ad14e |
if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
|
|
Packit |
6ad14e |
process_hash_table_expand();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Node of a singly linked list of PIDs.
struct pid_list {
    long pid;
    struct pid_list *next;  // next node, or NULL at the end of the list
};
typedef struct pid_list pid_list_t;
typedef struct pid_list *pid_list_p;

// Heads of the inclusion and exclusion PID lists; both start empty.
pid_list_p include_pid_list = NULL;
pid_list_p exclude_pid_list = NULL;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Add pid to a PID list unless it is already present.
//
//   list_ptr - current head of the list (may be NULL).
//   pid      - PID to add.
//
// When the list being extended is the include list and the PID is already in
// the process hash table, its INTERLEAVED flag is cleared first.
//
// Returns the head of the list: a newly allocated node when the PID was
// added, otherwise list_ptr unchanged.  Exits the process if allocation
// fails.
pid_list_p insert_pid_into_pid_list(pid_list_p list_ptr, long pid) {
    if (process_hash_table != NULL) {
        int hash_ix = process_hash_lookup(pid);
        if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
            // Clear interleaved flag, in case user wants it to be re-evaluated
            process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
        }
    }
    // Nothing to do if the pid is already listed
    for (pid_list_p cur = list_ptr; cur != NULL; cur = cur->next) {
        if (cur->pid == pid) {
            return list_ptr;
        }
    }
    // Not listed yet -- push a new node on the front
    pid_list_p new_head = malloc(sizeof(pid_list_t));
    if (new_head == NULL) {
        numad_log(LOG_CRIT, "pid_list malloc failed\n");
        exit(EXIT_FAILURE);
    }
    new_head->pid = pid;
    new_head->next = list_ptr;
    return new_head;
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Unlink and free every node in the list headed by list_ptr whose pid equals
// the given pid.  Returns the head of the remaining list (NULL if it is now
// empty).
pid_list_p remove_pid_from_pid_list(pid_list_p list_ptr, long pid) {
    // "link" always addresses the pointer that leads to the node under
    // inspection (first the head, later some node's next field), so removing
    // the head needs no special case.
    pid_list_p *link = &list_ptr;
    while (*link != NULL) {
        pid_list_p candidate = *link;
        if (candidate->pid == pid) {
            *link = candidate->next;
            free(candidate);
        } else {
            link = &candidate->next;
        }
    }
    return list_ptr;
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void shut_down_numad() {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Shutting down numad\n");
|
|
Packit |
6ad14e |
flush_msg_queue();
|
|
Packit |
6ad14e |
unlink(VAR_RUN_FILE);
|
|
Packit |
6ad14e |
close_log_file();
|
|
Packit |
6ad14e |
exit(EXIT_SUCCESS);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void print_version_and_exit(char *prog_name) {
|
|
Packit |
6ad14e |
fprintf(stdout, "%s version: %s: compiled %s\n", prog_name, VERSION_STRING, __DATE__);
|
|
Packit |
6ad14e |
exit(EXIT_SUCCESS);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void print_usage_and_exit(char *prog_name) {
|
|
Packit |
6ad14e |
fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
|
|
Packit |
6ad14e |
fprintf(stderr, "-C 1 to count inactive file cache as available memory (default 1)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-C 0 to count inactive file cache memory as unavailable (default 1)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-h to print this usage info\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS);
|
|
Packit |
6ad14e |
fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes (default 0)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes (default 0)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-m <N> to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT);
|
|
Packit |
6ad14e |
fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-S 1 to scan all processes (default 1)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-S 0 to scan only explicit PID list processes (default 1)\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-t <N> to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT);
|
|
Packit |
6ad14e |
fprintf(stderr, "-u <N> to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT);
|
|
Packit |
6ad14e |
fprintf(stderr, "-v for verbose (same effect as '-l 6')\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-V to show version info\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-w <CPUs>[:<MBs>] for NUMA node suggestions\n");
|
|
Packit |
6ad14e |
fprintf(stderr, "-x <PID> to add PID to exclusion pid list\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void set_thp_scan_sleep_ms(int new_ms) {
|
|
Packit |
6ad14e |
if (new_ms < 1) {
|
|
Packit |
6ad14e |
// 0 means do not change the system default
|
|
Packit |
6ad14e |
return;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
|
|
Packit |
6ad14e |
int fd = open(thp_scan_fname, O_RDWR, 0);
|
|
Packit |
6ad14e |
if (fd >= 0) {
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
if (bytes > 0) {
|
|
Packit |
6ad14e |
buf[bytes] = '\0';
|
|
Packit |
6ad14e |
int cur_ms;
|
|
Packit |
6ad14e |
char *p = buf;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, cur_ms);
|
|
Packit |
6ad14e |
if (cur_ms != new_ms) {
|
|
Packit |
6ad14e |
lseek(fd, 0, SEEK_SET);
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
|
|
Packit |
6ad14e |
sprintf(buf, "%d\n", new_ms);
|
|
Packit |
6ad14e |
write(fd, buf, strlen(buf));
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void check_prereqs(char *prog_name) {
|
|
Packit |
6ad14e |
// Adjust kernel tunable to scan for THP more frequently...
|
|
Packit |
6ad14e |
set_thp_scan_sleep_ms(thp_scan_sleep_ms);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int get_daemon_pid() {
|
|
Packit |
6ad14e |
int fd = open(VAR_RUN_FILE, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if (fd < 0) {
|
|
Packit |
6ad14e |
return 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
if (bytes <= 0) {
|
|
Packit |
6ad14e |
return 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int pid;
|
|
Packit |
6ad14e |
char *p = buf;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, pid);
|
|
Packit |
6ad14e |
// Check run file pid still active
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
|
|
Packit |
6ad14e |
if (access(fname, F_OK) < 0) {
|
|
Packit |
6ad14e |
if (errno == ENOENT) {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname);
|
|
Packit |
6ad14e |
unlink(VAR_RUN_FILE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Daemon must be running already.
|
|
Packit |
6ad14e |
return pid;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int register_numad_pid() {
|
|
Packit |
6ad14e |
int pid;
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int fd;
|
|
Packit |
6ad14e |
create_run_file:
|
|
Packit |
6ad14e |
fd = open(VAR_RUN_FILE, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
|
|
Packit |
6ad14e |
if (fd >= 0) {
|
|
Packit |
6ad14e |
pid = getpid();
|
|
Packit |
6ad14e |
sprintf(buf, "%d\n", pid);
|
|
Packit |
6ad14e |
write(fd, buf, strlen(buf));
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Registering numad version %s PID %d\n", VERSION_STRING, pid);
|
|
Packit |
6ad14e |
return pid;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (errno == EEXIST) {
|
|
Packit |
6ad14e |
fd = open(VAR_RUN_FILE, O_RDWR|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
|
|
Packit |
6ad14e |
if (fd < 0) {
|
|
Packit |
6ad14e |
goto fail_numad_run_file;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
if (bytes > 0) {
|
|
Packit |
6ad14e |
char *p = buf;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, pid);
|
|
Packit |
6ad14e |
// Check pid in run file still active
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
|
|
Packit |
6ad14e |
if (access(fname, F_OK) < 0) {
|
|
Packit |
6ad14e |
if (errno == ENOENT) {
|
|
Packit |
6ad14e |
// Assume run file is out-of-date...
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname);
|
|
Packit |
6ad14e |
unlink(VAR_RUN_FILE);
|
|
Packit |
6ad14e |
goto create_run_file;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Daemon must be running already.
|
|
Packit |
6ad14e |
return pid;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fail_numad_run_file:
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Cannot open numad.pid file\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int count_set_bits_in_hex_list_file(char *fname) {
|
|
Packit |
6ad14e |
int sum = 0;
|
|
Packit |
6ad14e |
int fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if (fd >= 0) {
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < bytes); ix++) {
|
|
Packit |
6ad14e |
char c = tolower(buf[ix]);
|
|
Packit |
6ad14e |
switch (c) {
|
|
Packit |
6ad14e |
case '0' : sum += 0; break;
|
|
Packit |
6ad14e |
case '1' : sum += 1; break;
|
|
Packit |
6ad14e |
case '2' : sum += 1; break;
|
|
Packit |
6ad14e |
case '3' : sum += 2; break;
|
|
Packit |
6ad14e |
case '4' : sum += 1; break;
|
|
Packit |
6ad14e |
case '5' : sum += 2; break;
|
|
Packit |
6ad14e |
case '6' : sum += 2; break;
|
|
Packit |
6ad14e |
case '7' : sum += 3; break;
|
|
Packit |
6ad14e |
case '8' : sum += 1; break;
|
|
Packit |
6ad14e |
case '9' : sum += 2; break;
|
|
Packit |
6ad14e |
case 'a' : sum += 2; break;
|
|
Packit |
6ad14e |
case 'b' : sum += 3; break;
|
|
Packit |
6ad14e |
case 'c' : sum += 2; break;
|
|
Packit |
6ad14e |
case 'd' : sum += 3; break;
|
|
Packit |
6ad14e |
case 'e' : sum += 3; break;
|
|
Packit |
6ad14e |
case 'f' : sum += 4; break;
|
|
Packit |
6ad14e |
case ' ' : sum += 0; break;
|
|
Packit |
6ad14e |
case ',' : sum += 0; break;
|
|
Packit |
6ad14e |
case '\n' : sum += 0; break;
|
|
Packit |
6ad14e |
default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return sum;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int get_num_cpus() {
|
|
Packit |
6ad14e |
int n1 = sysconf(_SC_NPROCESSORS_CONF);
|
|
Packit |
6ad14e |
int n2 = sysconf(_SC_NPROCESSORS_ONLN);
|
|
Packit |
6ad14e |
if (n1 < n2) {
|
|
Packit |
6ad14e |
n1 = n2;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (n1 < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Cannot count number of processors\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return n1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int get_num_kvm_vcpu_threads(int pid) {
|
|
Packit |
6ad14e |
// Try to return the number of vCPU threads for this VM guest,
|
|
Packit |
6ad14e |
// excluding the IO threads. All failures return MAXINT.
|
|
Packit |
6ad14e |
// FIXME: someday figure out some better way to do this...
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
|
|
Packit |
6ad14e |
int fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if (fd >= 0) {
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
if (bytes > 0) {
|
|
Packit |
6ad14e |
char *p = memmem(buf, bytes, "smp", 3);
|
|
Packit |
6ad14e |
if (p != NULL) {
|
|
Packit |
6ad14e |
while (!isdigit(*p) && (p - buf < bytes - 2)) {
|
|
Packit |
6ad14e |
p++;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (isdigit(*p)) {
|
|
Packit |
6ad14e |
int vcpu_threads;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, vcpu_threads);
|
|
Packit |
6ad14e |
if ((vcpu_threads > 0) && (vcpu_threads <= num_cpus)) {
|
|
Packit |
6ad14e |
return vcpu_threads;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return MAXINT;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
uint64_t get_huge_page_size_in_bytes() {
|
|
Packit |
6ad14e |
uint64_t huge_page_size = 0;;
|
|
Packit |
6ad14e |
FILE *fs = fopen("/proc/meminfo", "r");
|
|
Packit |
6ad14e |
if (!fs) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
while (fgets(buf, BUF_SIZE, fs)) {
|
|
Packit |
6ad14e |
if (!strncmp("Hugepagesize", buf, 12)) {
|
|
Packit |
6ad14e |
char *p = &buf[12];
|
|
Packit |
6ad14e |
while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) {
|
|
Packit |
6ad14e |
p++;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
huge_page_size = atol(p);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fclose(fs);
|
|
Packit |
6ad14e |
return huge_page_size * KILOBYTE;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
uint64_t get_time_stamp() {
|
|
Packit |
6ad14e |
// Return time stamp in hundredths of a second
|
|
Packit |
6ad14e |
struct timespec ts;
|
|
Packit |
6ad14e |
if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Cannot get clock_gettime()\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return (ts.tv_sec * ONE_HUNDRED) +
|
|
Packit |
6ad14e |
(ts.tv_nsec / (1000000000 / ONE_HUNDRED));
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Directory entry filter (signature matches the scandir() filter argument):
// returns non-zero when the entry name begins with a decimal digit, zero
// otherwise.
static int name_starts_with_digit(const struct dirent *dptr) {
    const char first = dptr->d_name[0];
    return isdigit(first);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Bit operations on a bitmap stored as an array of unsigned long words.
// i is the (non-negative) bit index, a is the array.
//
// The one-bit mask is built from 1UL so that it is as wide as the array
// element.  With a plain unsigned int constant, a shift count of 32 or more
// is undefined where long is 64 bits, and the complement used by CLEAR_BIT
// would be zero-extended and wipe the upper half of the word.
#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))
#define SET_BIT(i,a)   ((a)[(i) / BITS_IN_LONG] |= (1UL << ((i) % BITS_IN_LONG)))
#define TEST_BIT(i,a)  (((a)[(i) / BITS_IN_LONG] & (1UL << ((i) % BITS_IN_LONG))) != 0)
#define CLEAR_BIT(i,a) ((a)[(i) / BITS_IN_LONG] &= ~(1UL << ((i) % BITS_IN_LONG)))
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Bind every task of a process to the CPUs of its target nodes, then migrate
// its memory onto those nodes.
//
// p: pointer to an element in the process hash table; p->node_list_p names
//    the target nodes.  A NULL p, a pid below 1, or a missing node list logs
//    a critical message and exits the program.
//
// Returns 1 when the process still exists after the move, 0 when it appears
// to have gone away (task list unreadable, migrate_pages reports ESRCH, or
// /proc/<pid> is no longer accessible).
//
// Side effects: updates p->process_MBs as if all memory moved, and sets
// p->bind_time_stamp on success.
int bind_process_and_migrate_memory(process_data_p p) {
    uint64_t t0 = get_time_stamp();
    // Parameter p is a pointer to an element in the hash table
    if ((!p) || (p->pid < 1)) {
        numad_log(LOG_CRIT, "Bad PID to bind\n");
        exit(EXIT_FAILURE);
    }
    if (!p->node_list_p) {
        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
        exit(EXIT_FAILURE);
    }
    // Generate CPU list derived from target node list.
    static id_list_p cpu_bind_list_p;
    CLEAR_CPU_LIST(cpu_bind_list_p);
    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
    int node_id = 0;
    while (nodes) {
        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
            nodes -= 1;
        }
        node_id += 1;
    }
    char fname[FNAME_SIZE];
    struct dirent **namelist;
    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
    if (num_tasks <= 0) {
        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
        return 0;  // Assume the process terminated
    }
    // Set the affinity of each task in the process...
    for (int namelist_ix = 0; (namelist_ix < num_tasks); namelist_ix++) {
        int tid = atoi(namelist[namelist_ix]->d_name);
        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
        if (rc < 0) {
            // Check errno
            if (errno == ESRCH) {
                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
            }
            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
        }
        free(namelist[namelist_ix]);
    }
    free(namelist);
    // Now move the memory to the target nodes....
    // The masks are static so they are allocated once and reused (and only
    // grown) across calls.
    static unsigned long *dest_mask;
    static unsigned long *from_mask;
    static int allocated_bytes_in_masks;
    // Lie about num_nodes being one bigger because of kernel bug...
    int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
    if (allocated_bytes_in_masks < num_bytes_in_masks) {
        allocated_bytes_in_masks = num_bytes_in_masks;
        dest_mask = realloc(dest_mask, num_bytes_in_masks);
        from_mask = realloc(from_mask, num_bytes_in_masks);
        if ((dest_mask == NULL) || (from_mask == NULL)) {
            numad_log(LOG_CRIT, "bit mask malloc failed\n");
            exit(EXIT_FAILURE);
        }
    }
    // In an effort to put semi-balanced memory in each target node, move the
    // contents from the source node with the max amount of memory to the
    // destination node with the least amount of memory.  Repeat until done.
    int prev_from_node_id = -1;
    for (;;) {
        int min_dest_node_id = -1;
        int max_from_node_id = -1;
        for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
            node_id = node[node_ix].node_id;
            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
                    min_dest_node_id = node_id;
                }
            } else {
                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
                    max_from_node_id = node_id;
                }
            }
        }
        // No source node exists when every node is a target, and no
        // destination exists when no known node is in the target list.  In
        // either case there is nothing to migrate, and the -1 sentinel must
        // not be used as an array index or bit number.
        if ((max_from_node_id < 0) || (min_dest_node_id < 0)) {
            break;
        }
        if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {
            break;
        }
        memset(dest_mask, 0, num_bytes_in_masks);
        memset(from_mask, 0, num_bytes_in_masks);
        SET_BIT(max_from_node_id, from_mask);
        SET_BIT(min_dest_node_id, dest_mask);
        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
        // Lie about num_nodes being one bigger because of kernel bug...
        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
        if (rc > 2) {
            // rc == the number of pages that could not be moved.
            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
        } else if (rc < 0) {
            // Check errno
            if (errno == ESRCH) {
                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
                return 0;  // Assume the process terminated
            }
        }
        // Assume memory did move for current accounting purposes...
        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
        p->process_MBs[max_from_node_id] = 0;
        prev_from_node_id = max_from_node_id;
    }
    // Check pid still active
    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
    if (access(fname, F_OK) < 0) {
        numad_log(LOG_WARNING, "Could not migrate pid %d.  Apparently it went away.\n", p->pid);
        return 0;
    } else {
        uint64_t t1 = get_time_stamp();
        p->bind_time_stamp = t1;
        char node_list_str[BUF_SIZE];
        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
        // Time stamps are in hundredths of a second.  The 64-bit difference
        // is converted to long to match the conversion specifiers, and the
        // fraction is zero padded so that 1 s + 5/100 s prints as "1.05".
        long elapsed = (long)(t1 - t0);
        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %ld.%02ld seconds\n", p->pid, node_list_str, elapsed / 100, elapsed % 100);
        return 1;
    }
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// One sample of per-CPU idle counters.  cpu_data_t names the structure and
// cpu_data_p a pointer to it.
typedef struct cpu_data cpu_data_t;
typedef struct cpu_data *cpu_data_p;

struct cpu_data {
    uint64_t time_stamp;    // when the sample was taken
    uint64_t *idle;         // array of idle counters, one entry per CPU
};

// Two sets, to calc deltas
cpu_data_t cpu_data_buf[2];
// Index (0 or 1) of the set holding the most recent sample
int cur_cpu_data_buf = 0;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void update_cpu_data() {
|
|
Packit |
6ad14e |
// Parse idle percents from CPU stats in /proc/stat cpu<N> lines
|
|
Packit |
6ad14e |
static FILE *fs;
|
|
Packit |
6ad14e |
if (fs != NULL) {
|
|
Packit |
6ad14e |
rewind(fs);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
fs = fopen("/proc/stat", "r");
|
|
Packit |
6ad14e |
if (!fs) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Cannot get /proc/stat contents\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
cpu_data_buf[0].idle = malloc(num_cpus * sizeof(uint64_t));
|
|
Packit |
6ad14e |
cpu_data_buf[1].idle = malloc(num_cpus * sizeof(uint64_t));
|
|
Packit |
6ad14e |
if ((cpu_data_buf[0].idle == NULL) || (cpu_data_buf[1].idle == NULL)) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "cpu_data_buf malloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Use the other cpu_data buffer...
|
|
Packit |
6ad14e |
int new = 1 - cur_cpu_data_buf;
|
|
Packit |
6ad14e |
// First get the current time stamp
|
|
Packit |
6ad14e |
cpu_data_buf[new].time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
// Now pull the idle stat from each cpu<N> line
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
while (fgets(buf, BUF_SIZE, fs)) {
|
|
Packit |
6ad14e |
/*
|
|
Packit |
6ad14e |
* Lines are of the form:
|
|
Packit |
6ad14e |
*
|
|
Packit |
6ad14e |
* cpu<N> user nice system idle iowait irq softirq steal guest guest_nice
|
|
Packit |
6ad14e |
*
|
|
Packit |
6ad14e |
* # cat /proc/stat
|
|
Packit |
6ad14e |
* cpu 11105906 0 78639 3359578423 24607 151679 322319 0 0 0
|
|
Packit |
6ad14e |
* cpu0 190540 0 1071 52232942 39 7538 234039 0 0 0
|
|
Packit |
6ad14e |
* cpu1 124519 0 50 52545188 0 1443 6267 0 0 0
|
|
Packit |
6ad14e |
* cpu2 143133 0 452 52531440 36 1573 834 0 0 0
|
|
Packit |
6ad14e |
* . . . .
|
|
Packit |
6ad14e |
*/
|
|
Packit |
6ad14e |
if ( (buf[0] == 'c') && (buf[1] == 'p') && (buf[2] == 'u') && (isdigit(buf[3])) ) {
|
|
Packit |
6ad14e |
char *p = &buf[3];
|
|
Packit |
6ad14e |
int cpu_id = *p++ - '0'; while (isdigit(*p)) { cpu_id *= 10; cpu_id += (*p++ - '0'); }
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip user
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip nice
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip system
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; }
|
|
Packit |
6ad14e |
uint64_t idle;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, idle);
|
|
Packit |
6ad14e |
cpu_data_buf[new].idle[cpu_id] = idle;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
cur_cpu_data_buf = new;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Directory entry filter (signature matches the scandir() filter argument):
// returns 1 when the entry name is "node" followed by one or more decimal
// digits and nothing else, 0 otherwise.
int node_and_digits(const struct dirent *dptr) {
    const char *name = dptr->d_name;
    // The name has to begin with the literal prefix "node".
    for (const char *want = "node"; (*want != '\0'); want++, name++) {
        if (*name != *want) {
            return 0;
        }
    }
    // At least one character must follow, and all that follow must be digits.
    if (*name == '\0') {
        return 0;
    }
    for (; (*name != '\0'); name++) {
        if (!isdigit(*name)) {
            return 0;
        }
    }
    return 1;
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
uint64_t node_info_time_stamp = 0;
|
|
Packit |
6ad14e |
id_list_p all_cpus_list_p = NULL;
|
|
Packit |
6ad14e |
id_list_p all_nodes_list_p = NULL;
|
|
Packit |
6ad14e |
id_list_p reserved_cpu_mask_list_p = NULL;
|
|
Packit |
6ad14e |
char *reserved_cpu_str = NULL;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void show_nodes() {
|
|
Packit |
6ad14e |
fprintf(log_fs, "\n");
|
|
Packit |
6ad14e |
numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
|
|
Packit |
6ad14e |
fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n",
|
|
Packit |
6ad14e |
min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
|
|
Packit |
6ad14e |
fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n",
|
|
Packit |
6ad14e |
min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ",
|
|
Packit |
6ad14e |
ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
|
|
Packit |
6ad14e |
for (int d = 0; (d < num_nodes); d++) {
|
|
Packit |
6ad14e |
fprintf(log_fs, "%d ", node[ix].distance[d]);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
|
|
Packit |
6ad14e |
fprintf(log_fs, " CPUs: %s\n", buf);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fflush(log_fs);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int update_nodes() {
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
char buf[BIG_BUF_SIZE];
|
|
Packit |
6ad14e |
// First, check to see if we should refresh basic node info that probably never changes...
|
|
Packit |
6ad14e |
uint64_t time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
#define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
|
|
Packit |
6ad14e |
if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
|
|
Packit |
6ad14e |
node_info_time_stamp = time_stamp;
|
|
Packit |
6ad14e |
// Count directory names of the form: /sys/devices/system/node/node<N>
|
|
Packit |
6ad14e |
struct dirent **namelist;
|
|
Packit |
6ad14e |
int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
|
|
Packit |
6ad14e |
if (num_files < 1) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get NUMA node info\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int need_to_realloc = (num_files != num_nodes);
|
|
Packit |
6ad14e |
if (need_to_realloc) {
|
|
Packit |
6ad14e |
for (int ix = num_files; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
// If new < old, free old node_data pointers
|
|
Packit |
6ad14e |
free(node[ix].distance);
|
|
Packit |
6ad14e |
FREE_LIST(node[ix].cpu_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
node = realloc(node, (num_files * sizeof(node_data_t)));
|
|
Packit |
6ad14e |
if (node == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "node realloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
for (int ix = num_nodes; (ix < num_files); ix++) {
|
|
Packit |
6ad14e |
// If new > old, nullify new node_data pointers
|
|
Packit |
6ad14e |
node[ix].distance = NULL;
|
|
Packit |
6ad14e |
node[ix].cpu_list_p = NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
num_nodes = num_files;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sum_CPUs_total = 0;
|
|
Packit |
6ad14e |
CLEAR_CPU_LIST(all_cpus_list_p);
|
|
Packit |
6ad14e |
CLEAR_NODE_LIST(all_nodes_list_p);
|
|
Packit |
6ad14e |
// Figure out how many threads per core there are (for later discounting of hyper-threads)
|
|
Packit |
6ad14e |
threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
|
|
Packit |
6ad14e |
if (threads_per_core < 1) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not count threads per core\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// For each "node<N>" filename present, save <N> in node[ix].node_id
|
|
Packit |
6ad14e |
// Note that the node id might not necessarily match the node ix.
|
|
Packit |
6ad14e |
// Also populate the cpu lists and distance vectors for this node.
|
|
Packit |
6ad14e |
for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
|
|
Packit |
6ad14e |
int node_id;
|
|
Packit |
6ad14e |
char *p = &namelist[node_ix]->d_name[4];
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, node_id);
|
|
Packit |
6ad14e |
free(namelist[node_ix]);
|
|
Packit |
6ad14e |
node[node_ix].node_id = node_id;
|
|
Packit |
6ad14e |
ADD_ID_TO_LIST(node_id, all_nodes_list_p);
|
|
Packit |
6ad14e |
// Get all the CPU IDs in this node... Read lines from node<N>/cpulist
|
|
Packit |
6ad14e |
// file, and set the corresponding bits in the node cpu list.
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
|
|
Packit |
6ad14e |
int fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
|
|
Packit |
6ad14e |
buf[BIG_BUF_SIZE - 1] = '\0';
|
|
Packit |
6ad14e |
// get cpulist from the cpulist string
|
|
Packit |
6ad14e |
CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
|
|
Packit |
6ad14e |
int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
|
|
Packit |
6ad14e |
if (reserved_cpu_str != NULL) {
|
|
Packit |
6ad14e |
AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
|
|
Packit |
6ad14e |
n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
|
|
Packit |
6ad14e |
// Calculate total CPUs, but possibly discount hyper-threads
|
|
Packit |
6ad14e |
if ((threads_per_core == 1) || (htt_percent >= 100)) {
|
|
Packit |
6ad14e |
node[node_ix].CPUs_total = n * ONE_HUNDRED;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
n /= threads_per_core;
|
|
Packit |
6ad14e |
node[node_ix].CPUs_total = n * ONE_HUNDRED;
|
|
Packit |
6ad14e |
node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sum_CPUs_total += node[node_ix].CPUs_total;
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node cpu list\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Get distance vector of ACPI SLIT data from node<N>/distance file
|
|
Packit |
6ad14e |
if (need_to_realloc) {
|
|
Packit |
6ad14e |
node[node_ix].distance = realloc(node[node_ix].distance, (num_nodes * sizeof(uint8_t)));
|
|
Packit |
6ad14e |
if (node[node_ix].distance == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "node distance realloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/distance", node_id);
|
|
Packit |
6ad14e |
fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
|
|
Packit |
6ad14e |
int rnode = 0;
|
|
Packit |
6ad14e |
for (char *p = buf; (*p != '\n'); ) {
|
|
Packit |
6ad14e |
int lat;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, lat);
|
|
Packit |
6ad14e |
node[node_ix].distance[rnode++] = lat;
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node distance data\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
free(namelist);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Second, update the dynamic free memory and available CPU capacity
|
|
Packit |
6ad14e |
while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
|
|
Packit |
6ad14e |
// Make sure at least 7/100 of a second has passed.
|
|
Packit |
6ad14e |
// Otherwise sleep for 1/10 second.
|
|
Packit |
6ad14e |
struct timespec ts = { 0, 100000000 };
|
|
Packit |
6ad14e |
nanosleep(&ts, &ts);
|
|
Packit |
6ad14e |
time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
update_cpu_data();
|
|
Packit |
6ad14e |
max_node_MBs_free = 0;
|
|
Packit |
6ad14e |
max_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
min_node_MBs_free = MAXINT;
|
|
Packit |
6ad14e |
min_node_CPUs_free = MAXINT;
|
|
Packit |
6ad14e |
uint64_t sum_of_node_MBs_free = 0;
|
|
Packit |
6ad14e |
uint64_t sum_of_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
|
|
Packit |
6ad14e |
int node_id = node[node_ix].node_id;
|
|
Packit |
6ad14e |
// Get available memory info from node<N>/meminfo file
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
|
|
Packit |
6ad14e |
int fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
uint64_t KB;
|
|
Packit |
6ad14e |
buf[BIG_BUF_SIZE - 1] = '\0';
|
|
Packit |
6ad14e |
char *p = strstr(buf, "MemTotal:");
|
|
Packit |
6ad14e |
if (p != NULL) {
|
|
Packit |
6ad14e |
p += 9;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node MemTotal\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; }
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, KB);
|
|
Packit |
6ad14e |
node[node_ix].MBs_total = (KB / KILOBYTE);
|
|
Packit |
6ad14e |
if (node[node_ix].MBs_total < 1) {
|
|
Packit |
6ad14e |
// If a node has zero memory, remove it from the all_nodes_list...
|
|
Packit |
6ad14e |
CLR_ID_IN_LIST(node_id, all_nodes_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
p = strstr(p, "MemFree:");
|
|
Packit |
6ad14e |
if (p != NULL) {
|
|
Packit |
6ad14e |
p += 8;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node MemFree\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; }
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, KB);
|
|
Packit |
6ad14e |
node[node_ix].MBs_free = (KB / KILOBYTE);
|
|
Packit |
6ad14e |
if (use_inactive_file_cache) {
|
|
Packit |
6ad14e |
// Add inactive file cache quantity to "free" memory
|
|
Packit |
6ad14e |
p = strstr(p, "Inactive(file):");
|
|
Packit |
6ad14e |
if (p != NULL) {
|
|
Packit |
6ad14e |
p += 15;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
while (!isdigit(*p)) { p++; }
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, KB);
|
|
Packit |
6ad14e |
node[node_ix].MBs_free += (KB / KILOBYTE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sum_of_node_MBs_free += node[node_ix].MBs_free;
|
|
Packit |
6ad14e |
if (min_node_MBs_free > node[node_ix].MBs_free) {
|
|
Packit |
6ad14e |
min_node_MBs_free = node[node_ix].MBs_free;
|
|
Packit |
6ad14e |
min_node_MBs_free_ix = node[node_ix].node_id;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (max_node_MBs_free < node[node_ix].MBs_free) {
|
|
Packit |
6ad14e |
max_node_MBs_free = node[node_ix].MBs_free;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not get node meminfo\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// If both buffers have been populated by now, sum CPU idle data
|
|
Packit |
6ad14e |
// for each node in order to calculate available capacity
|
|
Packit |
6ad14e |
int old_cpu_data_buf = 1 - cur_cpu_data_buf;
|
|
Packit |
6ad14e |
if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
|
|
Packit |
6ad14e |
uint64_t idle_ticks = 0;
|
|
Packit |
6ad14e |
int cpu = 0;
|
|
Packit |
6ad14e |
int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
|
|
Packit |
6ad14e |
int num_cpus_to_process = num_lcpus;
|
|
Packit |
6ad14e |
while (num_cpus_to_process) {
|
|
Packit |
6ad14e |
if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
|
|
Packit |
6ad14e |
idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
|
|
Packit |
6ad14e |
- cpu_data_buf[old_cpu_data_buf].idle[cpu];
|
|
Packit |
6ad14e |
num_cpus_to_process -= 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
cpu += 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
uint64_t time_diff = cpu_data_buf[cur_cpu_data_buf].time_stamp
|
|
Packit |
6ad14e |
- cpu_data_buf[old_cpu_data_buf].time_stamp;
|
|
Packit |
6ad14e |
// printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
|
|
Packit |
6ad14e |
// assert(time_diff > 0);
|
|
Packit |
6ad14e |
node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
|
|
Packit |
6ad14e |
// Possibly discount hyper-threads
|
|
Packit |
6ad14e |
if ((threads_per_core > 1) && (htt_percent < 100)) {
|
|
Packit |
6ad14e |
uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
|
|
Packit |
6ad14e |
if (node[node_ix].CPUs_free > htt_discount) {
|
|
Packit |
6ad14e |
node[node_ix].CPUs_free -= htt_discount;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
node[node_ix].CPUs_free = 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
|
|
Packit |
6ad14e |
node[node_ix].CPUs_free = node[node_ix].CPUs_total;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sum_of_node_CPUs_free += node[node_ix].CPUs_free;
|
|
Packit |
6ad14e |
if (min_node_CPUs_free > node[node_ix].CPUs_free) {
|
|
Packit |
6ad14e |
min_node_CPUs_free = node[node_ix].CPUs_free;
|
|
Packit |
6ad14e |
min_node_CPUs_free_ix = node[node_ix].node_id;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (max_node_CPUs_free < node[node_ix].CPUs_free) {
|
|
Packit |
6ad14e |
max_node_CPUs_free = node[node_ix].CPUs_free;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
node[node_ix].CPUs_free = 0;
|
|
Packit |
6ad14e |
node[node_ix].magnitude = 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
|
|
Packit |
6ad14e |
avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
|
|
Packit |
6ad14e |
double MBs_variance_sum = 0.0;
|
|
Packit |
6ad14e |
double CPUs_variance_sum = 0.0;
|
|
Packit |
6ad14e |
for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
|
|
Packit |
6ad14e |
double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
|
|
Packit |
6ad14e |
double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
|
|
Packit |
6ad14e |
MBs_variance_sum += MBs_diff * MBs_diff;
|
|
Packit |
6ad14e |
CPUs_variance_sum += CPUs_diff * CPUs_diff;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
double MBs_variance = MBs_variance_sum / (num_nodes);
|
|
Packit |
6ad14e |
double CPUs_variance = CPUs_variance_sum / (num_nodes);
|
|
Packit |
6ad14e |
stddev_node_MBs_free = sqrt(MBs_variance);
|
|
Packit |
6ad14e |
stddev_node_CPUs_free = sqrt(CPUs_variance);
|
|
Packit |
6ad14e |
if (log_level >= LOG_INFO) {
|
|
Packit |
6ad14e |
show_nodes();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return num_nodes;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Layout of one /proc/<PID>/stat record, in field order.
//
// numad does not actually use this structure -- it is kept only to document
// the type and position of each stat item, some of which are used in the
// process_data_t structure.  The trailing number on each member is its
// zero-based field position within the record.
typedef struct stat_data {
    int      pid;                       //  0
    char    *comm;                      //  1
    char     state;                     //  2
    int      ppid;                      //  3
    int      pgrp;                      //  4
    int      session;                   //  5
    int      tty_nr;                    //  6
    int      tpgid;                     //  7
    unsigned flags;                     //  8
    uint64_t minflt;                    //  9
    uint64_t cminflt;                   // 10
    uint64_t majflt;                    // 11
    uint64_t cmajflt;                   // 12
    uint64_t utime;                     // 13
    uint64_t stime;                     // 14
    int64_t  cutime;                    // 15
    int64_t  cstime;                    // 16
    int64_t  priority;                  // 17
    int64_t  nice;                      // 18
    int64_t  num_threads;               // 19
    int64_t  itrealvalue;               // 20
    uint64_t starttime;                 // 21
    uint64_t vsize;                     // 22
    int64_t  rss;                       // 23
    uint64_t rsslim;                    // 24
    uint64_t startcode;                 // 25
    uint64_t endcode;                   // 26
    uint64_t startstack;                // 27
    uint64_t kstkesp;                   // 28
    uint64_t kstkeip;                   // 29
    uint64_t signal;                    // 30
    uint64_t blocked;                   // 31
    uint64_t sigignore;                 // 32
    uint64_t sigcatch;                  // 33
    uint64_t wchan;                     // 34
    uint64_t nswap;                     // 35
    uint64_t cnswap;                    // 36
    int      exit_signal;               // 37
    int      processor;                 // 38
    unsigned rt_priority;               // 39
    unsigned policy;                    // 40
    uint64_t delayacct_blkio_ticks;     // 41
    uint64_t guest_time;                // 42
    int64_t  cguest_time;               // 43
} stat_data_t, *stat_data_p;
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
process_data_p get_stat_data_for_pid(int pid, char *pid_string) {
|
|
Packit |
6ad14e |
// Note: This function uses static data buffers and is not thread safe.
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
if (pid >= 0) {
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%d/stat", pid);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%s/stat", pid_string);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int fd = open(fname, O_RDONLY, 0);
|
|
Packit |
6ad14e |
if (fd < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_WARNING, "Could not open stat file: %s\n", fname);
|
|
Packit |
6ad14e |
return NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
static char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
int bytes = read(fd, buf, BUF_SIZE);
|
|
Packit |
6ad14e |
close(fd);
|
|
Packit |
6ad14e |
if (bytes < 50) {
|
|
Packit |
6ad14e |
numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname);
|
|
Packit |
6ad14e |
return NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
uint64_t val;
|
|
Packit |
6ad14e |
char *p = buf;
|
|
Packit |
6ad14e |
static process_data_t data;
|
|
Packit |
6ad14e |
// Get PID from field 0
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.pid = val;
|
|
Packit |
6ad14e |
// Copy comm from field 1
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
data.comm = p; while (*p != ' ') { p++; }
|
|
Packit |
6ad14e |
*p++ = '\0'; // replacing the presumed single ' ' before next field
|
|
Packit |
6ad14e |
// Skip fields 2 through 12
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
|
Packit |
6ad14e |
// Get utime from field 13 for cpu_util
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.cpu_util = val;
|
|
Packit |
6ad14e |
// Get stime from field 14 to add on to cpu_util (which already has utime)
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.cpu_util += val;
|
|
Packit |
6ad14e |
// Skip fields 15 through 18
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < 4); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
|
Packit |
6ad14e |
// Get num_threads from field 19
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.num_threads = val;
|
|
Packit |
6ad14e |
// Skip fields 20 through 21
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < 2); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
|
|
Packit |
6ad14e |
// Get vsize from field 22 to compute MBs_size
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.MBs_size = val / MEGABYTE;
|
|
Packit |
6ad14e |
// Get rss from field 23 to compute MBs_used
|
|
Packit |
6ad14e |
while (*p == ' ') { p++; }
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(p, val);
|
|
Packit |
6ad14e |
data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
|
|
Packit |
6ad14e |
// Return pointer to data
|
|
Packit |
6ad14e |
return &dat;;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int update_processes() {
|
|
Packit |
6ad14e |
// Conditionally scan /proc/<PID>/stat files for processes we should
|
|
Packit |
6ad14e |
// perhaps manage. For all processes, evaluate whether or not they should
|
|
Packit |
6ad14e |
// be added to our hash table of managed processes candidates. If so,
|
|
Packit |
6ad14e |
// update the statistics, time stamp and utilization numbers for the select
|
|
Packit |
6ad14e |
// processes in the hash table.
|
|
Packit |
6ad14e |
uint64_t this_update_time = get_time_stamp();
|
|
Packit |
6ad14e |
int new_candidates = 0; // limit number of new candidates per update
|
|
Packit |
6ad14e |
int files = 0;
|
|
Packit |
6ad14e |
if (scan_all_processes) {
|
|
Packit |
6ad14e |
struct dirent **namelist;
|
|
Packit |
6ad14e |
files = scandir("/proc", &namelist, name_starts_with_digit, NULL);
|
|
Packit |
6ad14e |
if (files < 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not open /proc\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < files); ix++) {
|
|
Packit |
6ad14e |
process_data_p data_p;
|
|
Packit |
6ad14e |
if ((data_p = get_stat_data_for_pid(-1, namelist[ix]->d_name)) != NULL) {
|
|
Packit |
6ad14e |
// See if this process uses enough memory to be managed.
|
|
Packit |
6ad14e |
if ((data_p->MBs_used > MEMORY_THRESHOLD)
|
|
Packit |
6ad14e |
&& (new_candidates < process_hash_table_size / 3)) {
|
|
Packit |
6ad14e |
data_p->data_time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
new_candidates += process_hash_update(data_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
free(namelist[ix]);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
free(namelist);
|
|
Packit |
6ad14e |
} // scan_all_processes
|
|
Packit |
6ad14e |
// Process explicit inclusion and exclusion pid lists
|
|
Packit |
6ad14e |
pthread_mutex_lock(&pid_list_mutex);
|
|
Packit |
6ad14e |
// Include candidate processes from the explicit include pid list
|
|
Packit |
6ad14e |
pid_list_p pid_ptr = include_pid_list;
|
|
Packit |
6ad14e |
while ((pid_ptr != NULL) && (new_candidates < process_hash_table_size / 3)) {
|
|
Packit |
6ad14e |
int hash_ix = process_hash_lookup(pid_ptr->pid);
|
|
Packit |
6ad14e |
if ( (hash_ix >= 0) && (process_hash_table[hash_ix].data_time_stamp > this_update_time)) {
|
|
Packit |
6ad14e |
// Already in hash table, and recently updated...
|
|
Packit |
6ad14e |
pid_ptr = pid_ptr->next;
|
|
Packit |
6ad14e |
continue;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
process_data_p data_p;
|
|
Packit |
6ad14e |
if ((data_p = get_stat_data_for_pid(pid_ptr->pid, NULL)) != NULL) {
|
|
Packit |
6ad14e |
data_p->data_time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
new_candidates += process_hash_update(data_p);
|
|
Packit |
6ad14e |
if (!scan_all_processes) {
|
|
Packit |
6ad14e |
files += 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pid_ptr = pid_ptr->next;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
// no stat file so assume pid dead -- remove it from pid list
|
|
Packit |
6ad14e |
include_pid_list = remove_pid_from_pid_list(include_pid_list, pid_ptr->pid);
|
|
Packit |
6ad14e |
pid_ptr = include_pid_list; // just restart from list beginning
|
|
Packit |
6ad14e |
continue;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Zero CPU utilization for processes on the explicit exclude pid list
|
|
Packit |
6ad14e |
pid_ptr = exclude_pid_list;
|
|
Packit |
6ad14e |
while (pid_ptr != NULL) {
|
|
Packit |
6ad14e |
int hash_ix = process_hash_lookup(pid_ptr->pid);
|
|
Packit |
6ad14e |
if (hash_ix >= 0) {
|
|
Packit |
6ad14e |
process_hash_table[hash_ix].CPUs_used = 0;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pid_ptr = pid_ptr->next;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&pid_list_mutex);
|
|
Packit |
6ad14e |
if (log_level >= LOG_INFO) {
|
|
Packit |
6ad14e |
numad_log(LOG_INFO, "Processes: %d\n", files);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Now go through all managed processes to cleanup out-of-date and dead ones.
|
|
Packit |
6ad14e |
process_hash_table_cleanup(this_update_time);
|
|
Packit |
6ad14e |
return files;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Initialize the node list of a process from the "Mems_allowed_list:" line
// of its /proc/<PID>/status file.
//
//   p - pointer to an element in the process hash table; its process_MBs
//       pointer is reset to NULL and its node_list_p is cleared and refilled
//
// Returns the count reported by add_ids_to_list_from_str() for the allowed
// list, or 0 if the status file cannot be opened or read (the process is
// then assumed to have terminated).  Exits the daemon if p or its PID is
// invalid, or if the status data has no Mems_allowed_list entry.
int initialize_mem_node_list(process_data_p p) {
    static const char list_tag[] = "Mems_allowed_list:";

    if ((p == NULL) || (p->pid < 1)) {
        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
        exit(EXIT_FAILURE);
    }

    char path[FNAME_SIZE];
    char status[BIG_BUF_SIZE];
    p->process_MBs = NULL;
    CLEAR_NODE_LIST(p->node_list_p);

    snprintf(path, FNAME_SIZE, "/proc/%d/status", p->pid);
    int fd = open(path, O_RDONLY, 0);
    if (fd < 0) {
        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
        return 0;  // Assume the process terminated
    }
    int len = read(fd, status, BIG_BUF_SIZE);
    close(fd);
    if (len <= 0) {
        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
        return 0;  // Assume the process terminated
    }

    // Terminate the text, sacrificing the last byte if the buffer is full
    int end_ix = (len >= BIG_BUF_SIZE) ? (BIG_BUF_SIZE - 1) : len;
    status[end_ix] = '\0';

    char *cursor = strstr(status, list_tag);
    if (cursor == NULL) {
        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
        exit(EXIT_FAILURE);
    }
    // Step over the tag, then over whatever separates it from the first id
    cursor += sizeof(list_tag) - 1;
    while (!isdigit(*cursor)) { cursor++; }

    int n = add_ids_to_list_from_str(p->node_list_p, cursor);
    if (n < num_nodes) {
        // The process was already bound to a subset of nodes when we
        // discovered it, so set the initial bind_time_stamp to 30 minutes
        // ago...
        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
    }
    return n;
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// Score a node's free resources against a request.
//
//   ix        - node index, used only in the debug log line
//   mbs       - memory requested
//   cpus      - CPU requested
//   MBs_free  - memory available on the node
//   CPUs_free - CPU available on the node
//
// Each resource is split into the part that satisfies the request ("needed")
// and whatever is left over ("excess").  Needed memory is weighted 10 and
// excess memory 4; needed CPU is weighted 6 and excess CPU 1.  Returns the
// product of the weighted memory figure and the weighted CPU figure.
uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
    // Start from the "request not fully covered" case: everything that is
    // free is needed and nothing is left over.
    int64_t needed_mem = MBs_free;
    int64_t excess_mem = 0;
    if (MBs_free > mbs) {
        needed_mem = mbs;
        excess_mem = MBs_free - mbs;
    }

    int64_t needed_cpu = CPUs_free;
    int64_t excess_cpu = 0;
    if (CPUs_free > cpus) {
        needed_cpu = cpus;
        excess_cpu = CPUs_free - cpus;
    }

    // Weight the available resources; the magnitude is then the product of
    // the weighted CPU and weighted memory figures.
    int64_t memfactor = (needed_mem * 10 + excess_mem * 4);
    int64_t cpufactor = (needed_cpu * 6 + excess_cpu * 1);
    numad_log(LOG_DEBUG, " Node[%d]: mem: %ld cpu: %ld\n", ix, memfactor, cpufactor);
    return (memfactor * cpufactor);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "PICK NODES FOR: PID: %d, CPUs %d, MBs %d\n", pid, cpus, mbs);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
uint64_t proc_avg_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
// For existing processes, get miscellaneous process specific details
|
|
Packit |
6ad14e |
int pid_ix;
|
|
Packit |
6ad14e |
process_data_p p = NULL;
|
|
Packit |
6ad14e |
if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
|
|
Packit |
6ad14e |
p = &process_hash_table[pid_ix];
|
|
Packit |
6ad14e |
// Add up per-node memory in use by this process.
|
|
Packit |
6ad14e |
// This scanning is expensive and should be minimized.
|
|
Packit |
6ad14e |
char fname[FNAME_SIZE];
|
|
Packit |
6ad14e |
snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
|
|
Packit |
6ad14e |
FILE *fs = fopen(fname, "r");
|
|
Packit |
6ad14e |
if (!fs) {
|
|
Packit |
6ad14e |
numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
|
|
Packit |
6ad14e |
return NULL; // Assume the process terminated
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Allocate and zero per node memory array.
|
|
Packit |
6ad14e |
// The "+1 node" is for accumulating interleaved memory
|
|
Packit |
6ad14e |
p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t));
|
|
Packit |
6ad14e |
if (p->process_MBs == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "p->process_MBs realloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t));
|
|
Packit |
6ad14e |
int process_has_interleaved_memory = 0;
|
|
Packit |
6ad14e |
while (fgets(buf, BUF_SIZE, fs)) {
|
|
Packit |
6ad14e |
int interleaved_memory = 0;
|
|
Packit |
6ad14e |
uint64_t page_size = page_size_in_bytes;
|
|
Packit |
6ad14e |
const char *delimiters = " \n";
|
|
Packit |
6ad14e |
char *str_p = strtok(buf, delimiters);
|
|
Packit |
6ad14e |
while (str_p) {
|
|
Packit |
6ad14e |
if (!strncmp(str_p, "interleave", 10)) {
|
|
Packit |
6ad14e |
interleaved_memory = 1;
|
|
Packit |
6ad14e |
process_has_interleaved_memory = 1;
|
|
Packit |
6ad14e |
} else if (!strcmp(str_p, "huge")) {
|
|
Packit |
6ad14e |
page_size = huge_page_size_in_bytes;
|
|
Packit |
6ad14e |
} else if (*str_p++ == 'N') {
|
|
Packit |
6ad14e |
int node;
|
|
Packit |
6ad14e |
uint64_t pages;
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(str_p, node);
|
|
Packit |
6ad14e |
if (*str_p++ != '=') {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "numa_maps node number parse error\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
CONVERT_DIGITS_TO_NUM(str_p, pages);
|
|
Packit |
6ad14e |
p->process_MBs[node] += (pages * page_size);
|
|
Packit |
6ad14e |
if (interleaved_memory) {
|
|
Packit |
6ad14e |
// sum interleaved quantity in "extra node"
|
|
Packit |
6ad14e |
p->process_MBs[num_nodes] += (pages * page_size);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Get next token on the line
|
|
Packit |
6ad14e |
str_p = strtok(NULL, delimiters);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fclose(fs);
|
|
Packit |
6ad14e |
proc_avg_node_CPUs_free = p->CPUs_used;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix <= num_nodes); ix++) {
|
|
Packit |
6ad14e |
p->process_MBs[ix] /= MEGABYTE;
|
|
Packit |
6ad14e |
if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) {
|
|
Packit |
6ad14e |
if (ix == num_nodes) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, p->process_MBs[ix]);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (ID_IS_IN_LIST(ix, p->node_list_p)) {
|
|
Packit |
6ad14e |
proc_avg_node_CPUs_free += node[ix].CPUs_free;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p);
|
|
Packit |
6ad14e |
if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
|
|
Packit |
6ad14e |
// Mark this process as having interleaved memory so we do not
|
|
Packit |
6ad14e |
// merge the interleaved memory. Time stamp it as done and return.
|
|
Packit |
6ad14e |
p->flags |= PROCESS_FLAG_INTERLEAVED;
|
|
Packit |
6ad14e |
p->bind_time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
} // end of existing PID conditional
|
|
Packit |
6ad14e |
// Make a copy of node available resources array. Add in info specific to
|
|
Packit |
6ad14e |
// this process to equalize available resource quantities wrt locations of
|
|
Packit |
6ad14e |
// resources already in use by this process.
|
|
Packit |
6ad14e |
static node_data_p tmp_node;
|
|
Packit |
6ad14e |
tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
|
|
Packit |
6ad14e |
if (tmp_node == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "tmp_node realloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
|
|
Packit |
6ad14e |
uint64_t sum_of_node_CPUs_free = 0;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
if (pid > 0) {
|
|
Packit |
6ad14e |
if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) {
|
|
Packit |
6ad14e |
// Process not yet bound to a subset of nodes.
|
|
Packit |
6ad14e |
// Add back memory used by this process on this node.
|
|
Packit |
6ad14e |
tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16); // Apply light mem bias
|
|
Packit |
6ad14e |
// Add back CPU used by this process in proportion to the memory used on this node.
|
|
Packit |
6ad14e |
tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
// If the process is currently running on less than all the
|
|
Packit |
6ad14e |
// nodes, first add back (biased) memory already used by this
|
|
Packit |
6ad14e |
// process on this node, then assign average process CPU / node
|
|
Packit |
6ad14e |
// for this process iff the process is present on this node.
|
|
Packit |
6ad14e |
tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4); // Apply heavy mem bias
|
|
Packit |
6ad14e |
if (ID_IS_IN_LIST(ix, p->node_list_p)) {
|
|
Packit |
6ad14e |
tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sum_of_node_CPUs_free += tmp_node[ix].CPUs_free;
|
|
Packit |
6ad14e |
if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
|
|
Packit |
6ad14e |
tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
|
|
Packit |
6ad14e |
tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Enforce 1/100th CPU minimum
|
|
Packit |
6ad14e |
if (tmp_node[ix].CPUs_free < 1) {
|
|
Packit |
6ad14e |
tmp_node[ix].CPUs_free = 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
|
|
Packit |
6ad14e |
tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Now figure out where to get resources for this request....
|
|
Packit |
6ad14e |
static id_list_p target_node_list_p;
|
|
Packit |
6ad14e |
CLEAR_NODE_LIST(target_node_list_p);
|
|
Packit |
6ad14e |
if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) {
|
|
Packit |
6ad14e |
// System CPUs might be oversubscribed, but...
|
|
Packit |
6ad14e |
assume_enough_cpus = 1;
|
|
Packit |
6ad14e |
// and rely on available memory for placement.
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Establish a CPU flex fudge factor, on the presumption it is OK if not
|
|
Packit |
6ad14e |
// quite all the CPU request is met. However, if trying to find resources
|
|
Packit |
6ad14e |
// for pre-placement advice request, do not underestimate the amount of
|
|
Packit |
6ad14e |
// CPUs needed. Instead, err on the side of providing too many resources.
|
|
Packit |
6ad14e |
int cpu_flex = 0;
|
|
Packit |
6ad14e |
if ((pid > 0) && (target_utilization < 100)) {
|
|
Packit |
6ad14e |
// FIXME: Is half of the utilization margin a good amount of CPU flexing?
|
|
Packit |
6ad14e |
cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Figure out minimum number of nodes required
|
|
Packit |
6ad14e |
int mem_req_nodes = ceil((double)mbs / (double)node[0].MBs_total);
|
|
Packit |
6ad14e |
int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total);
|
|
Packit |
6ad14e |
int min_req_nodes = mem_req_nodes;
|
|
Packit |
6ad14e |
if (min_req_nodes < cpu_req_nodes) {
|
|
Packit |
6ad14e |
min_req_nodes = cpu_req_nodes;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (min_req_nodes > num_nodes) {
|
|
Packit |
6ad14e |
min_req_nodes = num_nodes;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Use an index to sort NUMA connected resource chain for each node
|
|
Packit |
6ad14e |
int index[num_nodes];
|
|
Packit |
6ad14e |
uint64_t totmag[num_nodes];
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
// Reset the index each time
|
|
Packit |
6ad14e |
for (int n = 0; (n < num_nodes); n++) {
|
|
Packit |
6ad14e |
index[n] = n;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Sort by minimum relative NUMA distance from node[ix],
|
|
Packit |
6ad14e |
// breaking distance ties with magnitude of available resources
|
|
Packit |
6ad14e |
for (int ij = 0; (ij < num_nodes); ij++) {
|
|
Packit |
6ad14e |
int best_ix = ij;
|
|
Packit |
6ad14e |
for (int ik = ij + 1; (ik < num_nodes); ik++) {
|
|
Packit |
6ad14e |
int ik_dist = tmp_node[index[ik]].distance[ix];
|
|
Packit |
6ad14e |
int best_ix_dist = tmp_node[index[best_ix]].distance[ix];
|
|
Packit |
6ad14e |
if (best_ix_dist > ik_dist) {
|
|
Packit |
6ad14e |
best_ix = ik;
|
|
Packit |
6ad14e |
} else if (best_ix_dist == ik_dist) {
|
|
Packit |
6ad14e |
if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
|
|
Packit |
6ad14e |
best_ix = ik;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (best_ix != ij) {
|
|
Packit |
6ad14e |
int tmp = index[ij];
|
|
Packit |
6ad14e |
index[ij] = index[best_ix];
|
|
Packit |
6ad14e |
index[best_ix] = tmp;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
#if 0
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
for (int iq = 0; (iq < num_nodes); iq++) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n",
|
|
Packit |
6ad14e |
tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
#endif
|
|
Packit |
6ad14e |
// Save the totmag[] sum of the magnitudes of expected needed nodes,
|
|
Packit |
6ad14e |
// "normalized" by NUMA distance (by dividing each magnitude by the
|
|
Packit |
6ad14e |
// relative distance squared).
|
|
Packit |
6ad14e |
totmag[ix] = 0;
|
|
Packit |
6ad14e |
for (int ij = 0; (ij < min_req_nodes); ij++) {
|
|
Packit |
6ad14e |
int dist = tmp_node[index[ij]].distance[ix];
|
|
Packit |
6ad14e |
totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist));
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Now find the best NUMA node based on the normalized sum of node
|
|
Packit |
6ad14e |
// magnitudes expected to be used.
|
|
Packit |
6ad14e |
int best_node_ix = 0;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
if (totmag[best_node_ix] < totmag[ix]) {
|
|
Packit |
6ad14e |
best_node_ix = ix;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix);
|
|
Packit |
6ad14e |
// Reset sorting index again
|
|
Packit |
6ad14e |
for (int n = 0; (n < num_nodes); n++) {
|
|
Packit |
6ad14e |
index[n] = n;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Sort index by distance from node[best_node_ix],
|
|
Packit |
6ad14e |
// breaking distance ties with magnitude
|
|
Packit |
6ad14e |
for (int ij = 0; (ij < num_nodes); ij++) {
|
|
Packit |
6ad14e |
int best_ix = ij;
|
|
Packit |
6ad14e |
for (int ik = ij + 1; (ik < num_nodes); ik++) {
|
|
Packit |
6ad14e |
int ik_dist = tmp_node[index[ik]].distance[best_node_ix];
|
|
Packit |
6ad14e |
int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix];
|
|
Packit |
6ad14e |
if (best_ix_dist > ik_dist) {
|
|
Packit |
6ad14e |
best_ix = ik;
|
|
Packit |
6ad14e |
} else if (best_ix_dist == ik_dist) {
|
|
Packit |
6ad14e |
if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
|
|
Packit |
6ad14e |
best_ix = ik;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (best_ix != ij) {
|
|
Packit |
6ad14e |
int tmp = index[ij];
|
|
Packit |
6ad14e |
index[ij] = index[best_ix];
|
|
Packit |
6ad14e |
index[best_ix] = tmp;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
for (int iq = 0; (iq < num_nodes); iq++) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n",
|
|
Packit |
6ad14e |
tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Allocate more resources until request is met.
|
|
Packit |
6ad14e |
best_node_ix = 0;
|
|
Packit |
6ad14e |
while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]);
|
|
Packit |
6ad14e |
ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p);
|
|
Packit |
6ad14e |
min_req_nodes -= 1;
|
|
Packit |
6ad14e |
if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
|
|
Packit |
6ad14e |
// Apparently we must use all resource nodes...
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// "Consume" the resources on this node
|
|
Packit |
6ad14e |
#define CPUS_MARGIN 0
|
|
Packit |
6ad14e |
#define MBS_MARGIN 100
|
|
Packit |
6ad14e |
if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) {
|
|
Packit |
6ad14e |
tmp_node[index[best_node_ix]].MBs_free -= mbs;
|
|
Packit |
6ad14e |
mbs = 0;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN);
|
|
Packit |
6ad14e |
tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) {
|
|
Packit |
6ad14e |
tmp_node[index[best_node_ix]].CPUs_free -= cpus;
|
|
Packit |
6ad14e |
cpus = 0;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN);
|
|
Packit |
6ad14e |
tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Next line optional, since we will not look at that node again
|
|
Packit |
6ad14e |
tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free);
|
|
Packit |
6ad14e |
best_node_ix += 1;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// For existing processes, calculate the non-local memory percent to see if
|
|
Packit |
6ad14e |
// process is already in the right place.
|
|
Packit |
6ad14e |
if ((pid > 0) && (p != NULL)) {
|
|
Packit |
6ad14e |
uint64_t nonlocal_memory = 0;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < num_nodes); ix++) {
|
|
Packit |
6ad14e |
if (!ID_IS_IN_LIST(ix, target_node_list_p)) {
|
|
Packit |
6ad14e |
// Accumulate total of nonlocal memory
|
|
Packit |
6ad14e |
nonlocal_memory += p->process_MBs[ix];
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int disp_percent = (100 * nonlocal_memory) / p->MBs_used;
|
|
Packit |
6ad14e |
// If this existing process is already located where we want it, then just
|
|
Packit |
6ad14e |
// return NULL indicating no need to change binding this time. Check the
|
|
Packit |
6ad14e |
// amount of nonlocal memory against the target_memlocality percent.
|
|
Packit |
6ad14e |
if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
|
|
Packit |
6ad14e |
// Already bound to targets, and enough of the memory is located where we want it, so no need to rebind
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
p->bind_time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
return NULL;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Must always provide at least one node for pre-placement advice
|
|
Packit |
6ad14e |
// FIXME: verify this can happen only if no resources requested...
|
|
Packit |
6ad14e |
if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
|
|
Packit |
6ad14e |
ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Log advice, and return target node list
|
|
Packit |
6ad14e |
if ((pid > 0) && (p->bind_time_stamp)) {
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, p->node_list_p);
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, all_nodes_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
char buf2[BUF_SIZE];
|
|
Packit |
6ad14e |
str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
|
|
Packit |
6ad14e |
char *cmd_name = "(unknown)";
|
|
Packit |
6ad14e |
if ((p) && (p->comm)) {
|
|
Packit |
6ad14e |
cmd_name = p->comm;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
|
|
Packit |
6ad14e |
if (pid > 0) {
|
|
Packit |
6ad14e |
COPY_LIST(target_node_list_p, p->node_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
return target_node_list_p;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int manage_loads() {
|
|
Packit |
6ad14e |
uint64_t time_stamp = get_time_stamp();
|
|
Packit |
6ad14e |
// Use temporary index to access and sort hash table entries
|
|
Packit |
6ad14e |
static int pindex_size;
|
|
Packit |
6ad14e |
static process_data_p *pindex;
|
|
Packit |
6ad14e |
if (pindex_size < process_hash_table_size) {
|
|
Packit |
6ad14e |
pindex_size = process_hash_table_size;
|
|
Packit |
6ad14e |
pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
|
|
Packit |
6ad14e |
if (pindex == NULL) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "pindex realloc failed\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Quick round trip whenever we resize the hash table.
|
|
Packit |
6ad14e |
// This is mostly to avoid max_interval wait at start up.
|
|
Packit |
6ad14e |
return min_interval / 2;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
memset(pindex, 0, pindex_size * sizeof(process_data_p));
|
|
Packit |
6ad14e |
// Copy live candidate pointers to the index for sorting
|
|
Packit |
6ad14e |
// if they meet the threshold for memory usage and CPU usage.
|
|
Packit |
6ad14e |
int nprocs = 0;
|
|
Packit |
6ad14e |
long sum_CPUs_used = 0;
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < process_hash_table_size); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = &process_hash_table[ix];
|
|
Packit |
6ad14e |
if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
|
|
Packit |
6ad14e |
pindex[nprocs++] = p;
|
|
Packit |
6ad14e |
sum_CPUs_used += p->CPUs_used;
|
|
Packit |
6ad14e |
// Initialize node list, if not already done for this process.
|
|
Packit |
6ad14e |
if (p->node_list_p == NULL) {
|
|
Packit |
6ad14e |
initialize_mem_node_list(p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Order candidate considerations using timestamps and magnitude: amount of
|
|
Packit |
6ad14e |
// CPU used * amount of memory used. Not expecting a long list here. Use
|
|
Packit |
6ad14e |
// a simplistic sort -- however move all not yet bound to front of list and
|
|
Packit |
6ad14e |
// order by decreasing magnitude. Previously bound processes follow in
|
|
Packit |
6ad14e |
// bins of increasing magnitude treating values within 20% as aquivalent.
|
|
Packit |
6ad14e |
// Within bins, order by bind_time_stamp so oldest bound will be higher
|
|
Packit |
6ad14e |
// priority to evaluate. Start by moving all unbound to beginning.
|
|
Packit |
6ad14e |
int num_unbound = 0;
|
|
Packit |
6ad14e |
for (int ij = 0; (ij < nprocs); ij++) {
|
|
Packit |
6ad14e |
if (pindex[ij]->bind_time_stamp == 0) {
|
|
Packit |
6ad14e |
process_data_p tmp = pindex[num_unbound];
|
|
Packit |
6ad14e |
pindex[num_unbound++] = pindex[ij];
|
|
Packit |
6ad14e |
pindex[ij] = tmp;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Sort all unbound so biggest magnitude comes first
|
|
Packit |
6ad14e |
for (int ij = 0; (ij < num_unbound); ij++) {
|
|
Packit |
6ad14e |
int best = ij;
|
|
Packit |
6ad14e |
for (int ik = ij + 1; (ik < num_unbound); ik++) {
|
|
Packit |
6ad14e |
uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used);
|
|
Packit |
6ad14e |
uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
|
|
Packit |
6ad14e |
if (ik_mag <= best_mag) continue;
|
|
Packit |
6ad14e |
best = ik;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (best != ij) {
|
|
Packit |
6ad14e |
process_data_p tmp = pindex[ij];
|
|
Packit |
6ad14e |
pindex[ij] = pindex[best];
|
|
Packit |
6ad14e |
pindex[best] = tmp;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Sort the remaining candidates into bins of increasting magnitude, and by
|
|
Packit |
6ad14e |
// timestamp within bins.
|
|
Packit |
6ad14e |
for (int ij = num_unbound; (ij < nprocs); ij++) {
|
|
Packit |
6ad14e |
int best = ij;
|
|
Packit |
6ad14e |
for (int ik = ij + 1; (ik < nprocs); ik++) {
|
|
Packit |
6ad14e |
uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used);
|
|
Packit |
6ad14e |
uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
|
|
Packit |
6ad14e |
uint64_t min_mag = ik_mag;
|
|
Packit |
6ad14e |
uint64_t diff_mag = best_mag - ik_mag;
|
|
Packit |
6ad14e |
if (diff_mag < 0) {
|
|
Packit |
6ad14e |
diff_mag = -(diff_mag);
|
|
Packit |
6ad14e |
min_mag = best_mag;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
|
|
Packit |
6ad14e |
// difference > 20 percent. Use magnitude ordering
|
|
Packit |
6ad14e |
if (ik_mag <= best_mag) continue;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
// difference within 20 percent. Sort these by bind_time_stamp.
|
|
Packit |
6ad14e |
if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
best = ik;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (best != ij) {
|
|
Packit |
6ad14e |
process_data_p tmp = pindex[ij];
|
|
Packit |
6ad14e |
pindex[ij] = pindex[best];
|
|
Packit |
6ad14e |
pindex[best] = tmp;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Show the candidate processes in the log file
|
|
Packit |
6ad14e |
if ((log_level >= LOG_INFO) && (nprocs > 0)) {
|
|
Packit |
6ad14e |
numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < nprocs); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = pindex[ix];
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, p->node_list_p);
|
|
Packit |
6ad14e |
fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
|
|
Packit |
6ad14e |
p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fflush(log_fs);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Estimate desired size (+ margin capacity) and
|
|
Packit |
6ad14e |
// make resource requests for each candidate process
|
|
Packit |
6ad14e |
for (int ix = 0; (ix < nprocs); ix++) {
|
|
Packit |
6ad14e |
process_data_p p = pindex[ix];
|
|
Packit |
6ad14e |
// If this process has interleaved memory, recheck it only every 30 minutes...
|
|
Packit |
6ad14e |
#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
|
|
Packit |
6ad14e |
if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
|
|
Packit |
6ad14e |
&& (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
continue;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Expand resources needed estimate using target_utilization factor.
|
|
Packit |
6ad14e |
// Start with the CPUs actually used (capped by number of threads) for
|
|
Packit |
6ad14e |
// CPUs required, and the RSS MBs actually used for the MBs
|
|
Packit |
6ad14e |
// requirement,
|
|
Packit |
6ad14e |
int mem_target_utilization = target_utilization;
|
|
Packit |
6ad14e |
int cpu_target_utilization = target_utilization;
|
|
Packit |
6ad14e |
// Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
|
|
Packit |
6ad14e |
if (mem_target_utilization > 100) {
|
|
Packit |
6ad14e |
mem_target_utilization = 100;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// If the process virtual memory size is bigger than one node, and it
|
|
Packit |
6ad14e |
// is already using more than 80 percent of a node, then request MBs
|
|
Packit |
6ad14e |
// based on the virtual size rather than on the current amount in use.
|
|
Packit |
6ad14e |
int mb_request;
|
|
Packit |
6ad14e |
if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) {
|
|
Packit |
6ad14e |
mb_request = (p->MBs_size * 100) / mem_target_utilization;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
mb_request = (p->MBs_used * 100) / mem_target_utilization;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
|
|
Packit |
6ad14e |
// But do not give a process more CPUs than it has threads!
|
|
Packit |
6ad14e |
int thread_limit = p->num_threads;
|
|
Packit |
6ad14e |
// If process looks like a KVM guest, try to limit thread count to the
|
|
Packit |
6ad14e |
// number of vCPU threads. FIXME: Will need to do something more
|
|
Packit |
6ad14e |
// intelligent than this with guest IO threads when eventually
|
|
Packit |
6ad14e |
// considering devices and IRQs.
|
|
Packit |
6ad14e |
if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
|
|
Packit |
6ad14e |
int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
|
|
Packit |
6ad14e |
if (thread_limit > kvm_vcpu_threads) {
|
|
Packit |
6ad14e |
thread_limit = kvm_vcpu_threads;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
thread_limit *= ONE_HUNDRED;
|
|
Packit |
6ad14e |
if (cpu_request > thread_limit) {
|
|
Packit |
6ad14e |
cpu_request = thread_limit;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// If this process was recently bound, enforce a five-minute minimum
|
|
Packit |
6ad14e |
// delay between repeated attempts to potentially move the process.
|
|
Packit |
6ad14e |
#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED)
|
|
Packit |
6ad14e |
if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
|
|
Packit |
6ad14e |
// Skip re-evaluation because we just did it recently, but check
|
|
Packit |
6ad14e |
// first for node utilization balance to see if we should
|
|
Packit |
6ad14e |
// re-evaluate this particular process right now. If this process
|
|
Packit |
6ad14e |
// is running on one of the busiest nodes, go ahead and re-evaluate
|
|
Packit |
6ad14e |
// it if it looks like it should have a better place with
|
|
Packit |
6ad14e |
// sufficient resources. FIXME: this is currently implemented for
|
|
Packit |
6ad14e |
// only smallish processes that will fit in a single node.
|
|
Packit |
6ad14e |
if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p))
|
|
Packit |
6ad14e |
&& (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total)
|
|
Packit |
6ad14e |
&& (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free)
|
|
Packit |
6ad14e |
+ abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free)
|
|
Packit |
6ad14e |
< (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD) // CPU slop
|
|
Packit |
6ad14e |
&& (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free)
|
|
Packit |
6ad14e |
+ abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free)
|
|
Packit |
6ad14e |
< (max_node_MBs_free - min_node_MBs_free)) ) {
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
if (log_level >= LOG_DEBUG) {
|
|
Packit |
6ad14e |
numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
continue;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// OK, now pick NUMA nodes for this process and bind it!
|
|
Packit |
6ad14e |
pthread_mutex_lock(&node_info_mutex);
|
|
Packit |
6ad14e |
int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
|
|
Packit |
6ad14e |
id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
|
|
Packit |
6ad14e |
if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&node_info_mutex);
|
|
Packit |
6ad14e |
// Return minimum interval when actively moving processes
|
|
Packit |
6ad14e |
return min_interval;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&node_info_mutex);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Return maximum interval when no process movement
|
|
Packit |
6ad14e |
return max_interval;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
void *set_dynamic_options(void *arg) {
|
|
Packit |
6ad14e |
// int arg_value = *(int *)arg;
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
for (;;) {
|
|
Packit |
6ad14e |
// Loop here forever waiting for a msg to do something...
|
|
Packit |
6ad14e |
msg_t msg;
|
|
Packit |
6ad14e |
recv_msg(&msg;;
|
|
Packit |
6ad14e |
switch (msg.body.cmd) {
|
|
Packit |
6ad14e |
case 'C':
|
|
Packit |
6ad14e |
use_inactive_file_cache = (msg.body.arg1 != 0);
|
|
Packit |
6ad14e |
if (use_inactive_file_cache) {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'H':
|
|
Packit |
6ad14e |
thp_scan_sleep_ms = msg.body.arg1;
|
|
Packit |
6ad14e |
set_thp_scan_sleep_ms(thp_scan_sleep_ms);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'i':
|
|
Packit |
6ad14e |
min_interval = msg.body.arg1;
|
|
Packit |
6ad14e |
max_interval = msg.body.arg2;
|
|
Packit |
6ad14e |
if (max_interval <= 0) {
|
|
Packit |
6ad14e |
shut_down_numad();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'K':
|
|
Packit |
6ad14e |
keep_interleaved_memory = (msg.body.arg1 != 0);
|
|
Packit |
6ad14e |
if (keep_interleaved_memory) {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Keeping interleaved memory spread across nodes\n");
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Merging interleaved memory to localized NUMA nodes\n");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'l':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
|
|
Packit |
6ad14e |
log_level = msg.body.arg1;
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'm':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1);
|
|
Packit |
6ad14e |
target_memlocality = msg.body.arg1;
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'p':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_lock(&pid_list_mutex);
|
|
Packit |
6ad14e |
exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
include_pid_list = insert_pid_into_pid_list(include_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&pid_list_mutex);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'r':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Removing PID %d from explicit PID lists\n", msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_lock(&pid_list_mutex);
|
|
Packit |
6ad14e |
include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&pid_list_mutex);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'S':
|
|
Packit |
6ad14e |
scan_all_processes = (msg.body.arg1 != 0);
|
|
Packit |
6ad14e |
if (scan_all_processes) {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Scanning all processes\n");
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 't':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
|
|
Packit |
6ad14e |
htt_percent = msg.body.arg1;
|
|
Packit |
6ad14e |
node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'u':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
|
|
Packit |
6ad14e |
target_utilization = msg.body.arg1;
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'w':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n",
|
|
Packit |
6ad14e |
msg.body.arg1, msg.body.arg2);
|
|
Packit |
6ad14e |
pthread_mutex_lock(&node_info_mutex);
|
|
Packit |
6ad14e |
update_nodes();
|
|
Packit |
6ad14e |
id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, node_list_p);
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&node_info_mutex);
|
|
Packit |
6ad14e |
send_msg(msg.body.src_pid, 'w', 0, 0, buf);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'x':
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Adding PID %d to exclusion PID list\n", msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_lock(&pid_list_mutex);
|
|
Packit |
6ad14e |
include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, msg.body.arg1);
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&pid_list_mutex);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
default:
|
|
Packit |
6ad14e |
numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n",
|
|
Packit |
6ad14e |
msg.body.cmd, msg.body.arg1, msg.body.arg1, msg.body.text,
|
|
Packit |
6ad14e |
msg.body.src_pid);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
} // for (;;)
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
// parse_two_arg_values() -- parse an option value of the form "N", "N.F",
// "N:M" or "N.F:M", where N and M are non-negative decimal integers.
//
//   p                   string to parse
//   first_ptr           receives the first value (may be NULL)
//   second_ptr          receives the second value (may be NULL)
//   first_is_optional   when only one number is present: nonzero stores it
//                       through second_ptr, zero stores it through first_ptr
//   first_scale_digits  number of fractional digits folded into the first
//                       value, i.e. the first value is scaled by
//                       10^first_scale_digits ("1.5" with 2 digits -> 150);
//                       extra fractional digits are skipped (truncated)
//
// When both numbers are present, both are stored.  A pointer whose value is
// not supplied by the string is left untouched.
//
// Prints a message to stderr and exits with EXIT_FAILURE if a number is
// missing, negative, or does not fit in an int (including after scaling).
void parse_two_arg_values(char *p, int *first_ptr, int *second_ptr, int first_is_optional, int first_scale_digits) {
    char *orig_p = p;
    char *q = NULL;
    int first = 0;
    int second = -1;
    long value;
    errno = 0;
    value = strtol(p, &p, 10);
    // Range-check before narrowing to int so oversized input is rejected
    // instead of being silently truncated.
    if ((errno != 0) || (p == orig_p) || (value < 0) || (value > INT_MAX)) {
        goto bad_value;
    }
    first = (int) value;
    if (*p == '.') {
        p++;
        // <ctype.h> functions require an unsigned char value (or EOF)
        while ((first_scale_digits > 0) && (isdigit((unsigned char) *p))) {
            int digit = (*p++ - '0');
            if (first > (INT_MAX - digit) / 10) {
                goto bad_value;
            }
            first = (first * 10) + digit;
            first_scale_digits -= 1;
        }
        // Ignore any fractional digits beyond the requested precision
        while (isdigit((unsigned char) *p)) { p++; }
    }
    // Pad with zeros for fractional digits that were not supplied
    while (first_scale_digits > 0) {
        if (first > INT_MAX / 10) {
            goto bad_value;
        }
        first *= 10;
        first_scale_digits -= 1;
    }
    if (*p == ':') {
        q = p + 1;
        errno = 0;
        value = strtol(q, &p, 10);
        if ((errno != 0) || (p == q) || (value < 0) || (value > INT_MAX)) {
            goto bad_value;
        }
        second = (int) value;
    }
    if (q != NULL) {
        // Two numbers are present
        if (first_ptr  != NULL) *first_ptr = first;
        if (second_ptr != NULL) *second_ptr = second;
    } else if (first_is_optional) {
        if (second_ptr != NULL) *second_ptr = first;
    } else {
        if (first_ptr != NULL) *first_ptr = first;
    }
    return;

bad_value:
    fprintf(stderr, "Can't parse arg value(s): %s\n", orig_p);
    exit(EXIT_FAILURE);
}
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
|
|
Packit |
6ad14e |
int main(int argc, char *argv[]) {
|
|
Packit |
6ad14e |
int opt;
|
|
Packit |
6ad14e |
int C_flag = 0;
|
|
Packit |
6ad14e |
int d_flag = 0;
|
|
Packit |
6ad14e |
int H_flag = 0;
|
|
Packit |
6ad14e |
int i_flag = 0;
|
|
Packit |
6ad14e |
int K_flag = 0;
|
|
Packit |
6ad14e |
int l_flag = 0;
|
|
Packit |
6ad14e |
int m_flag = 0;
|
|
Packit |
6ad14e |
int p_flag = 0;
|
|
Packit |
6ad14e |
int r_flag = 0;
|
|
Packit |
6ad14e |
int S_flag = 0;
|
|
Packit |
6ad14e |
int t_flag = 0;
|
|
Packit |
6ad14e |
int u_flag = 0;
|
|
Packit |
6ad14e |
int v_flag = 0;
|
|
Packit |
6ad14e |
int w_flag = 0;
|
|
Packit |
6ad14e |
int x_flag = 0;
|
|
Packit |
6ad14e |
int tmp_int = 0;
|
|
Packit |
6ad14e |
long list_pid = 0;
|
|
Packit |
6ad14e |
while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
|
|
Packit |
6ad14e |
switch (opt) {
|
|
Packit |
6ad14e |
case 'C':
|
|
Packit |
6ad14e |
C_flag = 1;
|
|
Packit |
6ad14e |
use_inactive_file_cache = (atoi(optarg) != 0);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'd':
|
|
Packit |
6ad14e |
d_flag = 1;
|
|
Packit |
6ad14e |
log_level = LOG_DEBUG;
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'D':
|
|
Packit |
6ad14e |
// obsoleted
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'h':
|
|
Packit |
6ad14e |
print_usage_and_exit(argv[0]);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'H':
|
|
Packit |
6ad14e |
tmp_int = atoi(optarg);
|
|
Packit |
6ad14e |
if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
|
|
Packit |
6ad14e |
// 0 means do not change the system default value
|
|
Packit |
6ad14e |
H_flag = 1;
|
|
Packit |
6ad14e |
thp_scan_sleep_ms = tmp_int;
|
|
Packit |
6ad14e |
} else {
|
|
Packit |
6ad14e |
fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'i':
|
|
Packit |
6ad14e |
i_flag = 1;
|
|
Packit |
6ad14e |
parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'K':
|
|
Packit |
6ad14e |
K_flag = 1;
|
|
Packit |
6ad14e |
keep_interleaved_memory = (atoi(optarg) != 0);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'l':
|
|
Packit |
6ad14e |
l_flag = 1;
|
|
Packit |
6ad14e |
log_level = atoi(optarg);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'm':
|
|
Packit |
6ad14e |
tmp_int = atoi(optarg);
|
|
Packit |
6ad14e |
if ((tmp_int >= 50) && (tmp_int <= 100)) {
|
|
Packit |
6ad14e |
m_flag = 1;
|
|
Packit |
6ad14e |
target_memlocality = tmp_int;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'p':
|
|
Packit |
6ad14e |
p_flag = 1;
|
|
Packit |
6ad14e |
list_pid = atol(optarg);
|
|
Packit |
6ad14e |
exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
|
|
Packit |
6ad14e |
include_pid_list = insert_pid_into_pid_list(include_pid_list, list_pid);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'r':
|
|
Packit |
6ad14e |
r_flag = 1;
|
|
Packit |
6ad14e |
list_pid = atol(optarg);
|
|
Packit |
6ad14e |
// Remove this PID from both explicit pid lists.
|
|
Packit |
6ad14e |
include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
|
|
Packit |
6ad14e |
exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'R':
|
|
Packit |
6ad14e |
reserved_cpu_str = strdup(optarg);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'S':
|
|
Packit |
6ad14e |
S_flag = 1;
|
|
Packit |
6ad14e |
scan_all_processes = (atoi(optarg) != 0);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 't':
|
|
Packit |
6ad14e |
tmp_int = atoi(optarg);
|
|
Packit |
6ad14e |
if ((tmp_int >= 0) && (tmp_int <= 100)) {
|
|
Packit |
6ad14e |
t_flag = 1;
|
|
Packit |
6ad14e |
htt_percent = tmp_int;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'u':
|
|
Packit |
6ad14e |
tmp_int = atoi(optarg);
|
|
Packit |
6ad14e |
if ((tmp_int >= 10) && (tmp_int <= 130)) {
|
|
Packit |
6ad14e |
u_flag = 1;
|
|
Packit |
6ad14e |
target_utilization = tmp_int;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'v':
|
|
Packit |
6ad14e |
v_flag = 1;
|
|
Packit |
6ad14e |
log_level = LOG_INFO;
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'V':
|
|
Packit |
6ad14e |
print_version_and_exit(argv[0]);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'w':
|
|
Packit |
6ad14e |
w_flag = 1;
|
|
Packit |
6ad14e |
parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0, 2);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
case 'x':
|
|
Packit |
6ad14e |
x_flag = 1;
|
|
Packit |
6ad14e |
list_pid = atol(optarg);
|
|
Packit |
6ad14e |
include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
|
|
Packit |
6ad14e |
exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, list_pid);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
default:
|
|
Packit |
6ad14e |
print_usage_and_exit(argv[0]);
|
|
Packit |
6ad14e |
break;
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (argc > optind) {
|
|
Packit |
6ad14e |
fprintf(stderr, "Unexpected arg = %s\n", argv[optind]);
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (i_flag) {
|
|
Packit |
6ad14e |
if ((max_interval < min_interval) && (max_interval != 0)) {
|
|
Packit |
6ad14e |
fprintf(stderr, "Max interval (%d) must be greater than min interval (%d)\n", max_interval, min_interval);
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
open_log_file();
|
|
Packit |
6ad14e |
init_msg_queue();
|
|
Packit |
6ad14e |
num_cpus = get_num_cpus();
|
|
Packit |
6ad14e |
page_size_in_bytes = sysconf(_SC_PAGESIZE);
|
|
Packit |
6ad14e |
huge_page_size_in_bytes = get_huge_page_size_in_bytes();
|
|
Packit |
6ad14e |
// Figure out if this is the daemon, or a subsequent invocation
|
|
Packit |
6ad14e |
int daemon_pid = get_daemon_pid();
|
|
Packit |
6ad14e |
if (daemon_pid > 0) {
|
|
Packit |
6ad14e |
// Daemon is already running. So send dynamic options to persistant
|
|
Packit |
6ad14e |
// thread to handle requests, get the response (if any), and finish.
|
|
Packit |
6ad14e |
msg_t msg;
|
|
Packit |
6ad14e |
if (C_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (H_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (i_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'i', min_interval, max_interval, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (K_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'K', keep_interleaved_memory, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (d_flag || l_flag || v_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'l', log_level, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (m_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'm', target_memlocality, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (p_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'p', list_pid, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (r_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'r', list_pid, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (S_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (t_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 't', htt_percent, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (u_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'u', target_utilization, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (w_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'w', requested_cpus, requested_mbs, "");
|
|
Packit |
6ad14e |
recv_msg(&msg;;
|
|
Packit |
6ad14e |
fprintf(stdout, "%s\n", msg.body.text);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (x_flag) {
|
|
Packit |
6ad14e |
send_msg(daemon_pid, 'x', list_pid, 0, "");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
close_log_file();
|
|
Packit |
6ad14e |
exit(EXIT_SUCCESS);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// No numad daemon running yet.
|
|
Packit |
6ad14e |
// First, make note of any reserved CPUs....
|
|
Packit |
6ad14e |
if (reserved_cpu_str != NULL) {
|
|
Packit |
6ad14e |
CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
|
|
Packit |
6ad14e |
int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
|
|
Packit |
6ad14e |
// turn reserved list into a negated mask for later ANDing use...
|
|
Packit |
6ad14e |
negate_cpu_list(reserved_cpu_mask_list_p);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// If it is a "-w" pre-placement request, handle that without starting
|
|
Packit |
6ad14e |
// the daemon. Otherwise start the numad daemon.
|
|
Packit |
6ad14e |
if (w_flag) {
|
|
Packit |
6ad14e |
// Get pre-placement NUMA advice without starting daemon
|
|
Packit |
6ad14e |
update_nodes();
|
|
Packit |
6ad14e |
sleep(2);
|
|
Packit |
6ad14e |
update_nodes();
|
|
Packit |
6ad14e |
numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
|
|
Packit |
6ad14e |
id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
|
|
Packit |
6ad14e |
char buf[BUF_SIZE];
|
|
Packit |
6ad14e |
str_from_id_list(buf, BUF_SIZE, node_list_p);
|
|
Packit |
6ad14e |
fprintf(stdout, "%s\n", buf);
|
|
Packit |
6ad14e |
close_log_file();
|
|
Packit |
6ad14e |
exit(EXIT_SUCCESS);
|
|
Packit |
6ad14e |
} else if (max_interval > 0) {
|
|
Packit |
6ad14e |
// Start the numad daemon...
|
|
Packit |
6ad14e |
check_prereqs(argv[0]);
|
|
Packit |
6ad14e |
#if (!NO_DAEMON)
|
|
Packit |
6ad14e |
// Daemonize self...
|
|
Packit |
6ad14e |
daemon_pid = fork();
|
|
Packit |
6ad14e |
if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
|
|
Packit |
6ad14e |
// Parent process now exits
|
|
Packit |
6ad14e |
if (daemon_pid > 0) { exit(EXIT_SUCCESS); }
|
|
Packit |
6ad14e |
// Child process continues...
|
|
Packit |
6ad14e |
umask(S_IWGRP | S_IWOTH); // Reset the file mode
|
|
Packit |
6ad14e |
int sid = setsid(); // Start a new session
|
|
Packit |
6ad14e |
if (sid < 0) { numad_log(LOG_CRIT, "setsid() failed\n"); exit(EXIT_FAILURE); }
|
|
Packit |
6ad14e |
if ((chdir("/")) < 0) { numad_log(LOG_CRIT, "chdir() failed"); exit(EXIT_FAILURE); }
|
|
Packit |
6ad14e |
daemon_pid = register_numad_pid();
|
|
Packit |
6ad14e |
if (daemon_pid != getpid()) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "Could not register daemon PID\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
fclose(stdin);
|
|
Packit |
6ad14e |
fclose(stdout);
|
|
Packit |
6ad14e |
if (log_fs != stderr) {
|
|
Packit |
6ad14e |
fclose(stderr);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
#endif
|
|
Packit |
6ad14e |
// Set up signal handlers
|
|
Packit |
6ad14e |
struct sigaction sa;
|
|
Packit |
6ad14e |
memset(&sa, 0, sizeof(sa));
|
|
Packit |
6ad14e |
sa.sa_handler = sig_handler;
|
|
Packit |
6ad14e |
if (sigaction(SIGHUP, &sa, NULL)
|
|
Packit |
6ad14e |
|| sigaction(SIGTERM, &sa, NULL)
|
|
Packit |
6ad14e |
|| sigaction(SIGQUIT, &sa, NULL)) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "sigaction does not work?\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Allocate initial process hash table
|
|
Packit |
6ad14e |
process_hash_table_expand();
|
|
Packit |
6ad14e |
// Spawn a thread to handle messages from subsequent invocation requests
|
|
Packit |
6ad14e |
pthread_mutex_init(&pid_list_mutex, NULL);
|
|
Packit |
6ad14e |
pthread_mutex_init(&node_info_mutex, NULL);
|
|
Packit |
6ad14e |
pthread_attr_t attr;
|
|
Packit |
6ad14e |
if (pthread_attr_init(&attr) != 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "pthread_attr_init failure\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pthread_t tid;
|
|
Packit |
6ad14e |
if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
|
|
Packit |
6ad14e |
exit(EXIT_FAILURE);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
// Loop here forwever...
|
|
Packit |
6ad14e |
for (;;) {
|
|
Packit |
6ad14e |
int interval = max_interval;
|
|
Packit |
6ad14e |
pthread_mutex_lock(&node_info_mutex);
|
|
Packit |
6ad14e |
int nodes = update_nodes();
|
|
Packit |
6ad14e |
pthread_mutex_unlock(&node_info_mutex);
|
|
Packit |
6ad14e |
if (nodes > 1) {
|
|
Packit |
6ad14e |
update_processes();
|
|
Packit |
6ad14e |
interval = manage_loads();
|
|
Packit |
6ad14e |
if (interval < max_interval) {
|
|
Packit |
6ad14e |
// Update node info since we moved something
|
|
Packit |
6ad14e |
nodes = update_nodes();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
sleep(interval);
|
|
Packit |
6ad14e |
if (got_sigterm | got_sigquit) {
|
|
Packit |
6ad14e |
shut_down_numad();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (got_sighup) {
|
|
Packit |
6ad14e |
got_sighup = 0;
|
|
Packit |
6ad14e |
close_log_file();
|
|
Packit |
6ad14e |
open_log_file();
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
if (pthread_attr_destroy(&attr) != 0) {
|
|
Packit |
6ad14e |
numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
pthread_mutex_destroy(&pid_list_mutex);
|
|
Packit |
6ad14e |
pthread_mutex_destroy(&node_info_mutex);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
exit(EXIT_SUCCESS);
|
|
Packit |
6ad14e |
}
|
|
Packit |
6ad14e |
|