/*
 * NOTE(review): the seven bare #include directives below carry no header
 * name in this copy of the file; the names have to be restored from the
 * original source before this translation unit can build.
 */
#include "config.h"
#include
#include
#include
#include
#include
#include
#include
#include "irqbalance.h"
#include "types.h"

/* Human-readable irq class names; the list is terminated by a 0 entry. */
char *classes[] = {
	"other",
	"legacy",
	"storage",
	"video",
	"ethernet",
	"gbit-ethernet",
	"10gbit-ethernet",
	"virt-event",
	0
};

/* Default balance level for an irq, indexed by its irq class number. */
static int map_class_to_level[8] =
	{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CORE, BALANCE_CORE,
	  BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };

/*
 * Per-irq overrides produced by the user policy script.
 * get_irq_user_policy() fills the whole struct with -1 before parsing the
 * script output, so -1 in a field means "no policy given".
 */
struct user_irq_policy {
	int ban;		/* 1 = ban the irq, 0 = explicitly not banned */
	int level;		/* index of the balance_level name: none/package/cache/core */
	int numa_node_set;	/* set to 1 when a numa_node key was accepted */
	int numa_node;		/* NUMA node number given by the numa_node key */
};

/* irq_info entries added by add_one_irq_to_db() */
static GList *interrupts_db = NULL;
/* irq_info entries banned while the database is being built */
static GList *banned_irqs = NULL;
/* irq_info entries banned through add_cl_banned_irq() */
GList *cl_banned_irqs = NULL;
/* module name strings banned through add_cl_banned_module() */
static GList *cl_banned_modules = NULL;

#define SYSFS_DIR "/sys"
#define SYSDEV_DIR "/sys/bus/pci/devices"

/* Number of entries in the major class table of map_pci_irq_class() */
#define PCI_MAX_CLASS 0x14
/* Number of entries in the serial bus sub-class table of map_pci_irq_class() */
#define PCI_MAX_SERIAL_SUBCLASS 0x81

/* Value read_pci_data() returns when the sysfs file cannot be opened */
#define PCI_INVAL_DATA 0xFFFFFFFF

/* PCI identification read from the device's sysfs directory by get_pci_info() */
struct pci_info {
	unsigned short vendor;		/* from the "vendor" file */
	unsigned short device;		/* from the "device" file */
	unsigned short sub_vendor;	/* from the "subsystem_vendor" file */
	unsigned short sub_device;	/* from the "subsystem_device" file */
	unsigned int class;		/* from the "class" file */
};

/* PCI vendor ID, device ID */
#define PCI_VENDOR_PLX 0x10b5
#define PCI_DEVICE_PLX_PEX8619 0x8619
#define PCI_VENDOR_CAVIUM 0x177d
#define PCI_DEVICE_CAVIUM_CN61XX 0x0093

/* PCI subsystem vendor ID, subsystem device ID */
#define PCI_SUB_VENDOR_EMC 0x1120
#define PCI_SUB_DEVICE_EMC_055B 0x055b
#define PCI_SUB_DEVICE_EMC_0568 0x0568
#define PCI_SUB_DEVICE_EMC_dd00 0xdd00

/*
 * Apply software workarounds for some special devices
 *
 * The world is not perfect and supplies us with broken PCI devices.
 * Usually there are two sorts of cases:
 *
 * 1. The device is special
 *    Before shipping the devices, PCI spec doesn't have the definitions.
 *
 * 2. Buggy PCI devices
 *    Some PCI devices don't follow the PCI class code definitions.
*/ static void apply_pci_quirks(const struct pci_info *pci, int *irq_class) { if ((pci->vendor == PCI_VENDOR_PLX) && (pci->device == PCI_DEVICE_PLX_PEX8619) && (pci->sub_vendor == PCI_SUB_VENDOR_EMC)) { switch (pci->sub_device) { case PCI_SUB_DEVICE_EMC_055B: case PCI_SUB_DEVICE_EMC_dd00: *irq_class = IRQ_SCSI; break; } } if ((pci->vendor == PCI_VENDOR_CAVIUM) && (pci->device == PCI_DEVICE_CAVIUM_CN61XX) && (pci->sub_vendor == PCI_SUB_VENDOR_EMC)) { switch (pci->sub_device) { case PCI_SUB_DEVICE_EMC_0568: *irq_class = IRQ_SCSI; break; } } return; } /* Determin IRQ class based on PCI class code */ static int map_pci_irq_class(unsigned int pci_class) { unsigned int major = pci_class >> 16; unsigned int sub = (pci_class & 0xFF00) >> 8; int irq_class = IRQ_NODEF; /* * Class codes lifted from below PCI-SIG spec: * * PCI Code and ID Assignment Specification v1.5 * * and mapped to irqbalance types here. * * IRQ_NODEF will go through classification by PCI sub-class code. */ static short major_class_codes[PCI_MAX_CLASS] = { IRQ_OTHER, IRQ_SCSI, IRQ_ETH, IRQ_VIDEO, IRQ_OTHER, IRQ_OTHER, IRQ_LEGACY, IRQ_OTHER, IRQ_OTHER, IRQ_LEGACY, IRQ_OTHER, IRQ_OTHER, IRQ_NODEF, IRQ_ETH, IRQ_SCSI, IRQ_OTHER, IRQ_OTHER, IRQ_OTHER, IRQ_LEGACY, IRQ_LEGACY, }; /* * All sub-class code for serial bus controllers. * The major class code is 0xc. */ static short serial_sub_codes[PCI_MAX_SERIAL_SUBCLASS] = { IRQ_LEGACY, IRQ_LEGACY, IRQ_LEGACY, IRQ_LEGACY, IRQ_SCSI, IRQ_LEGACY, IRQ_SCSI, IRQ_LEGACY, IRQ_LEGACY, IRQ_LEGACY, [0xa ... 
0x7f] = IRQ_NODEF, IRQ_LEGACY, }; /* * Check major class code first */ if (major >= PCI_MAX_CLASS) return IRQ_NODEF; switch (major) { case 0xc: /* Serial bus class */ if (sub >= PCI_MAX_SERIAL_SUBCLASS) return IRQ_NODEF; irq_class = serial_sub_codes[sub]; break; default: /* All other PCI classes */ irq_class = major_class_codes[major]; break; } return irq_class; } /* Read specific data from sysfs */ static unsigned int read_pci_data(const char *devpath, const char* file) { char path[PATH_MAX]; FILE *fd; unsigned int data = PCI_INVAL_DATA; sprintf(path, "%s/%s", devpath, file); fd = fopen(path, "r"); if (!fd) { log(TO_CONSOLE, LOG_WARNING, "PCI: can't open file:%s\n", path); return data; } (void) fscanf(fd, "%x", &data); fclose(fd); return data; } /* Get pci information for IRQ classification */ static int get_pci_info(const char *devpath, struct pci_info *pci) { unsigned int data = PCI_INVAL_DATA; if ((data = read_pci_data(devpath, "vendor")) == PCI_INVAL_DATA) return -ENODEV; pci->vendor = (unsigned short)data; if ((data = read_pci_data(devpath, "device")) == PCI_INVAL_DATA) return -ENODEV; pci->device = (unsigned short)data; if ((data = read_pci_data(devpath, "subsystem_vendor")) == PCI_INVAL_DATA) return -ENODEV; pci->sub_vendor = (unsigned short)data; if ((data = read_pci_data(devpath, "subsystem_device")) == PCI_INVAL_DATA) return -ENODEV; pci->sub_device = (unsigned short)data; if ((data = read_pci_data(devpath, "class")) == PCI_INVAL_DATA) return -ENODEV; pci->class = data; return 0; } /* Return IRQ class for given devpath */ static int get_irq_class(const char *devpath) { int irq_class = IRQ_NODEF; struct pci_info pci; /* Get PCI info from sysfs */ if (get_pci_info(devpath, &pci) < 0) return IRQ_NODEF; /* Map PCI class code to irq class */ irq_class = map_pci_irq_class(pci.class); if (irq_class < 0) { log(TO_CONSOLE, LOG_WARNING, "Invalid PCI class code %d\n", pci.class); return IRQ_NODEF; } /* Reassign irq class for some buggy devices */ 
apply_pci_quirks(&pci, &irq_class); return irq_class; } static gint compare_ints(gconstpointer a, gconstpointer b) { const struct irq_info *ai = a; const struct irq_info *bi = b; return ai->irq - bi->irq; } static void add_banned_irq(int irq, GList **list) { struct irq_info find, *new; GList *entry; find.irq = irq; entry = g_list_find_custom(*list, &find, compare_ints); if (entry) return; new = calloc(sizeof(struct irq_info), 1); if (!new) { log(TO_CONSOLE, LOG_WARNING, "No memory to ban irq %d\n", irq); return; } new->irq = irq; new->flags |= IRQ_FLAG_BANNED; *list = g_list_append(*list, new); log(TO_CONSOLE, LOG_INFO, "IRQ %d was BANNED.\n", irq); return; } void add_cl_banned_irq(int irq) { add_banned_irq(irq, &cl_banned_irqs); } static int is_banned_irq(int irq) { GList *entry; struct irq_info find; find.irq = irq; entry = g_list_find_custom(banned_irqs, &find, compare_ints); return entry ? 1:0; } gint substr_find(gconstpointer a, gconstpointer b) { if (strstr(b, a)) return 0; else return 1; } static void add_banned_module(char *modname, GList **modlist) { GList *entry; char *newmod; entry = g_list_find_custom(*modlist, modname, substr_find); if (entry) return; newmod = strdup(modname); if (!newmod) { log(TO_CONSOLE, LOG_WARNING, "No memory to ban module %s\n", modname); return; } *modlist = g_list_append(*modlist, newmod); } void add_cl_banned_module(char *modname) { add_banned_module(modname, &cl_banned_modules); } /* * Inserts an irq_info struct into the intterupts_db list * devpath points to the device directory in sysfs for the * related device. NULL devpath means no sysfs entries for * this irq. 
*/ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq, struct user_irq_policy *pol) { int irq_class = IRQ_OTHER; struct irq_info *new, find; int numa_node; char path[PATH_MAX]; FILE *fd; char *lcpu_mask; GList *entry; ssize_t ret; size_t blen; /* * First check to make sure this isn't a duplicate entry */ find.irq = irq; entry = g_list_find_custom(interrupts_db, &find, compare_ints); if (entry) { log(TO_CONSOLE, LOG_INFO, "DROPPING DUPLICATE ENTRY FOR IRQ %d on path %s\n", irq, devpath); return NULL; } if (is_banned_irq(irq)) { log(TO_ALL, LOG_INFO, "SKIPPING BANNED IRQ %d\n", irq); return NULL; } new = calloc(sizeof(struct irq_info), 1); if (!new) return NULL; new->irq = irq; new->class = IRQ_OTHER; interrupts_db = g_list_append(interrupts_db, new); /* Some special irqs have NULL devpath */ if (devpath != NULL) { /* Map PCI class code to irq class */ irq_class = get_irq_class(devpath); if (irq_class < 0) goto get_numa_node; } new->class = irq_class; if (pol->level >= 0) new->level = pol->level; else new->level = map_class_to_level[irq_class]; get_numa_node: numa_node = -1; if (numa_avail) { sprintf(path, "%s/numa_node", devpath); fd = fopen(path, "r"); if (fd) { fscanf(fd, "%d", &numa_node); fclose(fd); } } if (pol->numa_node_set == 1) new->numa_node = get_numa_node(pol->numa_node); else new->numa_node = get_numa_node(numa_node); sprintf(path, "%s/local_cpus", devpath); fd = fopen(path, "r"); if (!fd) { cpus_setall(new->cpumask); goto out; } lcpu_mask = NULL; ret = getline(&lcpu_mask, &blen, fd); fclose(fd); if (ret <= 0) { cpus_setall(new->cpumask); } else { cpumask_parse_user(lcpu_mask, ret, new->cpumask); } free(lcpu_mask); out: log(TO_CONSOLE, LOG_INFO, "Adding IRQ %d to database\n", irq); return new; } static void parse_user_policy_key(char *buf, int irq, struct user_irq_policy *pol) { char *key, *value, *end; char *levelvals[] = { "none", "package", "cache", "core" }; int idx; int key_set = 1; key = buf; value = strchr(buf, '='); if 
(!value) { log(TO_SYSLOG, LOG_WARNING, "Bad format for policy, ignoring: %s\n", buf); return; } /* NULL terminate the key and advance value to the start of the value * string */ *value = '\0'; value++; end = strchr(value, '\n'); if (end) *end = '\0'; if (!strcasecmp("ban", key)) { if (!strcasecmp("false", value)) pol->ban = 0; else if (!strcasecmp("true", value)) pol->ban = 1; else { key_set = 0; log(TO_ALL, LOG_WARNING, "Unknown value for ban policy: %s\n", value); } } else if (!strcasecmp("balance_level", key)) { for (idx=0; idx<4; idx++) { if (!strcasecmp(levelvals[idx], value)) break; } if (idx>3) { key_set = 0; log(TO_ALL, LOG_WARNING, "Bad value for balance_level policy: %s\n", value); } else pol->level = idx; } else if (!strcasecmp("numa_node", key)) { idx = strtoul(value, NULL, 10); if (!get_numa_node(idx)) { log(TO_ALL, LOG_WARNING, "NUMA node %d doesn't exist\n", idx); return; } pol->numa_node = idx; pol->numa_node_set = 1; } else { key_set = 0; log(TO_ALL, LOG_WARNING, "Unknown key returned, ignoring: %s\n", key); } if (key_set) log(TO_ALL, LOG_INFO, "IRQ %d: Override %s to %s\n", irq, key, value); } /* * Calls out to a possibly user defined script to get user assigned policy * aspects for a given irq. 
A value of -1 in a given field indicates no * policy was given and that system defaults should be used */ static void get_irq_user_policy(char *path, int irq, struct user_irq_policy *pol) { char *cmd; FILE *output; char buffer[128]; char *brc; memset(pol, -1, sizeof(struct user_irq_policy)); /* Return defaults if no script was given */ if (!polscript) return; /* Use SYSFS_DIR for irq has no sysfs entries */ if (!path) path = SYSFS_DIR; cmd = alloca(strlen(path)+strlen(polscript)+64); if (!cmd) return; sprintf(cmd, "exec %s %s %d", polscript, path, irq); output = popen(cmd, "r"); if (!output) { log(TO_ALL, LOG_WARNING, "Unable to execute user policy script %s\n", polscript); return; } while(!feof(output)) { brc = fgets(buffer, 128, output); if (brc) parse_user_policy_key(brc, irq, pol); } pclose(output); } static int check_for_module_ban(char *name) { GList *entry; entry = g_list_find_custom(cl_banned_modules, name, substr_find); if (entry) return 1; else return 0; } static int check_for_irq_ban(char *path __attribute__((unused)), int irq, GList *proc_interrupts) { struct irq_info find, *res; GList *entry; /* * Check to see if we banned this irq on the command line */ find.irq = irq; entry = g_list_find_custom(cl_banned_irqs, &find, compare_ints); if (entry) return 1; /* * Check to see if we banned module which the irq belongs to. 
*/ entry = g_list_find_custom(proc_interrupts, &find, compare_ints); if (entry) { res = entry->data; if (check_for_module_ban(res->name)) return 1; } #ifdef INCLUDE_BANSCRIPT char *cmd; int rc; if (!banscript) return 0; if (!path) return 0; cmd = alloca(strlen(path)+strlen(banscript)+32); if (!cmd) return 0; sprintf(cmd, "%s %s %d > /dev/null",banscript, path, irq); rc = system(cmd); /* * The system command itself failed */ if (rc == -1) { log(TO_ALL, LOG_WARNING, "%s failed, please check the --banscript option\n", cmd); return 0; } if (WEXITSTATUS(rc)) { log(TO_ALL, LOG_INFO, "irq %d is baned by %s\n", irq, banscript); return 1; } #endif return 0; } /* * Figures out which interrupt(s) relate to the device we"re looking at in dirname */ static void build_one_dev_entry(const char *dirname, GList *tmp_irqs) { struct dirent *entry; DIR *msidir; FILE *fd; int irqnum; struct irq_info *new; char path[PATH_MAX]; char devpath[PATH_MAX]; struct user_irq_policy pol; sprintf(path, "%s/%s/msi_irqs", SYSDEV_DIR, dirname); sprintf(devpath, "%s/%s", SYSDEV_DIR, dirname); msidir = opendir(path); if (msidir) { do { entry = readdir(msidir); if (!entry) break; irqnum = strtol(entry->d_name, NULL, 10); if (irqnum) { new = get_irq_info(irqnum); if (new) continue; get_irq_user_policy(devpath, irqnum, &pol); if ((pol.ban == 1) || (check_for_irq_ban(devpath, irqnum, tmp_irqs))) { add_banned_irq(irqnum, &banned_irqs); continue; } new = add_one_irq_to_db(devpath, irqnum, &pol); if (!new) continue; new->type = IRQ_TYPE_MSIX; } } while (entry != NULL); closedir(msidir); return; } sprintf(path, "%s/%s/irq", SYSDEV_DIR, dirname); fd = fopen(path, "r"); if (!fd) return; if (fscanf(fd, "%d", &irqnum) < 0) goto done; /* * no pci device has irq 0 * irq 255 is invalid on x86/x64 architectures */ #if defined(__i386__) || defined(__x86_64__) if (irqnum && irqnum != 255) { #else if (irqnum) { #endif new = get_irq_info(irqnum); if (new) goto done; get_irq_user_policy(devpath, irqnum, &pol); if ((pol.ban 
== 1) || (check_for_irq_ban(path, irqnum, tmp_irqs))) { add_banned_irq(irqnum, &banned_irqs); goto done; } new = add_one_irq_to_db(devpath, irqnum, &pol); if (!new) goto done; new->type = IRQ_TYPE_LEGACY; } done: fclose(fd); return; } static void free_irq(struct irq_info *info, void *data __attribute__((unused))) { free(info); } void free_irq_db(void) { for_each_irq(NULL, free_irq, NULL); g_list_free(interrupts_db); interrupts_db = NULL; for_each_irq(banned_irqs, free_irq, NULL); g_list_free(banned_irqs); banned_irqs = NULL; g_list_free(rebalance_irq_list); rebalance_irq_list = NULL; } void free_cl_opts(void) { g_list_free_full(cl_banned_modules, free); g_list_free_full(cl_banned_irqs, free); g_list_free(banned_irqs); } static void add_new_irq(int irq, struct irq_info *hint, GList *proc_interrupts) { struct irq_info *new; struct user_irq_policy pol; new = get_irq_info(irq); if (new) return; /* Set NULL devpath for the irq has no sysfs entries */ get_irq_user_policy(NULL, irq, &pol); if ((pol.ban == 1) || check_for_irq_ban(NULL, irq, proc_interrupts)) { /*FIXME*/ add_banned_irq(irq, &banned_irqs); new = get_irq_info(irq); } else new = add_one_irq_to_db(NULL, irq, &pol); if (!new) { log(TO_CONSOLE, LOG_WARNING, "add_new_irq: Failed to add irq %d\n", irq); return; } /* * Override some of the new irq defaults here */ if (hint) { new->type = hint->type; new->class = hint->class; } new->level = map_class_to_level[new->class]; } static void add_missing_irq(struct irq_info *info, void *attr) { struct irq_info *lookup = get_irq_info(info->irq); GList *proc_interrupts = (GList *) attr; if (!lookup) add_new_irq(info->irq, info, proc_interrupts); } void rebuild_irq_db(void) { DIR *devdir; struct dirent *entry; GList *tmp_irqs = NULL; free_irq_db(); tmp_irqs = collect_full_irq_list(); devdir = opendir(SYSDEV_DIR); if (!devdir) goto free; do { entry = readdir(devdir); if (!entry) break; build_one_dev_entry(entry->d_name, tmp_irqs); } while (entry != NULL); closedir(devdir); 
for_each_irq(tmp_irqs, add_missing_irq, interrupts_db); free: g_list_free_full(tmp_irqs, free); } void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data) { GList *entry = g_list_first(list ? list : interrupts_db); GList *next; while (entry) { next = g_list_next(entry); cb(entry->data, data); entry = next; } } struct irq_info *get_irq_info(int irq) { GList *entry; struct irq_info find; find.irq = irq; entry = g_list_find_custom(interrupts_db, &find, compare_ints); if (!entry) entry = g_list_find_custom(banned_irqs, &find, compare_ints); return entry ? entry->data : NULL; } void migrate_irq(GList **from, GList **to, struct irq_info *info) { GList *entry; struct irq_info find, *tmp; find.irq = info->irq; entry = g_list_find_custom(*from, &find, compare_ints); if (!entry) return; tmp = entry->data; *from = g_list_delete_link(*from, entry); *to = g_list_append(*to, tmp); info->moved = 1; } static gint sort_irqs(gconstpointer A, gconstpointer B) { struct irq_info *a, *b; a = (struct irq_info*)A; b = (struct irq_info*)B; if (a->class < b->class) return 1; if (a->class > b->class) return -1; if (a->load < b->load) return 1; if (a->load > b->load) return -1; if (a < b) return 1; return -1; } void sort_irq_list(GList **list) { *list = g_list_sort(*list, sort_irqs); }